本文目录导读:

我来介绍几种Python实现数据分类的常用方法和完整案例:
使用scikit-learn进行机器学习分类
鸢尾花数据集分类案例
# 导入所需库
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
# Load the iris dataset and split it into features and labels.
iris = load_iris()
X, y = iris.data, iris.target

# Wrap the features in a DataFrame (with the label column) for inspection.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
print("数据前5行:")
print(df.head())

# Hold out 20% of the samples as a test set; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics are learned from the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 3-nearest-neighbours classifier on the standardized training data.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict labels for the held-out samples.
y_pred = knn.predict(X_test_scaled)

# Report per-class metrics and the confusion matrix.
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print("混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

# Classify one new sample; it must go through the same fitted scaler first.
new_sample = np.array([[5.1, 3.5, 1.4, 0.2]])
new_sample_scaled = scaler.transform(new_sample)
prediction = knn.predict(new_sample_scaled)
print(f"\n新样本预测类别:{iris.target_names[prediction[0]]}")
使用决策树进行分类
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
# Build a shallow decision tree; the fixed seed makes tie-breaking repeatable.
dt_classifier = DecisionTreeClassifier(
    max_depth=3,  # maximum depth of the tree
    random_state=42,
)

# Train on the standardized training features.
dt_classifier.fit(X_train_scaled, y_train)

# Predict labels for the held-out samples.
y_pred_dt = dt_classifier.predict(X_test_scaled)

# Mean accuracy on the test split.
accuracy = dt_classifier.score(X_test_scaled, y_test)
print(f"决策树准确率:{accuracy:.2f}")

# Visualize the fitted tree. Because the model was trained on standardized
# features, the split thresholds shown in the plot are in standardized units.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    dt_classifier,
    feature_names=iris.feature_names,
    # iris.target_names is a NumPy array; some scikit-learn releases only
    # accept a list here, so convert it explicitly.
    class_names=list(iris.target_names),
    filled=True,
)
plt.show()
逻辑回归分类
from sklearn.linear_model import LogisticRegression
# Build the logistic-regression classifier.
# The `multi_class` argument is deprecated in recent scikit-learn releases;
# the default solver already fits a multinomial model for multiclass targets,
# so it is simply omitted here.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

# Train on the standardized training features.
log_reg.fit(X_train_scaled, y_train)

# Hard label predictions for the held-out samples.
y_pred_lr = log_reg.predict(X_test_scaled)

# Per-class probabilities for the held-out samples (one column per class).
y_pred_proba = log_reg.predict_proba(X_test_scaled)

print("逻辑回归准确率:", log_reg.score(X_test_scaled, y_test))
print("\n前5个样本的预测概率:")
print(pd.DataFrame(y_pred_proba[:5], columns=iris.target_names))
文本分类案例
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Tiny labelled sample of movie-review sentences.
texts = [
    "这部电影很棒,我很喜欢",
    "故事剧情很无聊,浪费时间",
    "演员表演很出色,值得一看",
    "画面太差,不建议观看",
    "强烈推荐,非常好的电影",
]
labels = ['正面', '负面', '正面', '负面', '正面']

# Vectorize the text with character n-grams.
# The default word analyzer splits on word boundaries, which for unsegmented
# Chinese turns every clause into a single token; unseen sentences then map
# to all-zero vectors and the classifier can only fall back to the class
# prior. Character uni-/bi-grams let new sentences share features with the
# training sentences without requiring an external word segmenter.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
X_text = vectorizer.fit_transform(texts)

# Multinomial naive Bayes works directly on the non-negative TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_text, labels)

# Classify new sentences; they must be transformed by the fitted vectorizer.
new_texts = ["这个电影太精彩了", "很糟糕的观影体验"]
X_new = vectorizer.transform(new_texts)
predictions = nb_classifier.predict(X_new)

print("新文本分类结果:")
for text, pred in zip(new_texts, predictions):
    print(f"'{text}' → {pred}")
图像分类(使用Keras)
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Load the MNIST dataset (downloaded on first use).
(X_train_img, y_train_img), (X_test_img, y_test_img) = keras.datasets.mnist.load_data()

# Flatten each image to a 784-element vector and rescale by 1/255 as float32.
X_train_img = X_train_img.reshape(-1, 28 * 28).astype('float32') / 255.0
X_test_img = X_test_img.reshape(-1, 28 * 28).astype('float32') / 255.0

# One-hot encode the labels into 10 classes.
y_train_cat = keras.utils.to_categorical(y_train_img, 10)
y_test_cat = keras.utils.to_categorical(y_test_img, 10)

# Build a small fully connected network.
# The input shape is declared with an explicit keras.Input instead of the
# `input_shape=` argument on the first layer, which Keras 3 warns against
# for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(784,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='softmax'),
])

# categorical_crossentropy matches the one-hot encoded labels above.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, holding out 10% of the training data for validation.
history = model.fit(
    X_train_img, y_train_cat,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)

# Evaluate on the test set.
test_loss, test_acc = model.evaluate(X_test_img, y_test_cat)
print(f"\n测试集准确率:{test_acc:.4f}")

# Predict a single image: keep a leading batch dimension of 1, then take the
# index of the highest softmax output as the predicted digit.
sample_img = X_test_img[0].reshape(1, 784)
prediction = model.predict(sample_img)
predicted_digit = np.argmax(prediction)
print(f"预测数字:{predicted_digit}")
完整的数据分类流程封装
class DataClassifier:
    """Small wrapper bundling feature scaling with a scikit-learn classifier.

    Typical flow: ``prepare_data`` -> ``train`` -> ``evaluate`` on the splits
    returned by ``prepare_data``, then ``predict`` on new, raw samples.

    Supported ``classifier_type`` values are ``'knn'``, ``'dt'`` and ``'lr'``.
    """

    def __init__(self, classifier_type='knn'):
        self.classifier_type = classifier_type
        # Created lazily by create_classifier() / train().
        self.classifier = None
        self.scaler = StandardScaler()

    def prepare_data(self, X, y, test_size=0.2):
        """Split the data and standardize the features.

        The scaler is fitted on the training split only and then applied to
        both splits.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test)``; the feature
            arrays are already standardized.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        return X_train_scaled, X_test_scaled, y_train, y_test

    def create_classifier(self):
        """Instantiate the estimator selected by ``classifier_type``.

        Raises:
            ValueError: if ``classifier_type`` is not supported.
        """
        if self.classifier_type == 'knn':
            self.classifier = KNeighborsClassifier(n_neighbors=3)
        elif self.classifier_type == 'dt':
            # Fixed seed so repeated runs build the same tree.
            self.classifier = DecisionTreeClassifier(max_depth=3, random_state=42)
        elif self.classifier_type == 'lr':
            self.classifier = LogisticRegression(max_iter=1000)
        else:
            raise ValueError("不支持的分类器类型")

    def train(self, X_train, y_train):
        """Create a fresh classifier and fit it on standardized features."""
        self.create_classifier()
        self.classifier.fit(X_train, y_train)

    def _require_trained(self):
        """Fail with a clear message when no classifier has been created yet."""
        if self.classifier is None:
            raise RuntimeError("模型尚未训练,请先调用 train()")

    def predict(self, X):
        """Predict labels for RAW (unscaled) samples.

        The fitted scaler is applied here, so callers must not pre-scale ``X``.

        Raises:
            RuntimeError: if called before ``train``.
        """
        self._require_trained()
        X_scaled = self.scaler.transform(X)
        return self.classifier.predict(X_scaled)

    def evaluate(self, X_test, y_test):
        """Return the accuracy on already-standardized test features.

        ``X_test`` is expected to be the scaled test split returned by
        ``prepare_data``. It is passed to the classifier directly; routing it
        through ``predict`` would apply the scaler a second time and distort
        the features.

        Raises:
            RuntimeError: if called before ``train``.
        """
        self._require_trained()
        y_pred = self.classifier.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        return accuracy
# Usage example: build a KNN-backed classifier, split and standardize the
# iris data, train on the training split and report test accuracy.
classifier = DataClassifier(classifier_type='knn')
X_train, X_test, y_train, y_test = classifier.prepare_data(X, y, test_size=0.2)
classifier.train(X_train, y_train)
accuracy = classifier.evaluate(X_test, y_test)
print(f"分类准确率:{accuracy:.2f}")
分类算法对比
# 对比不同分类算法的性能
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import time
# Candidate models to compare. Estimators with randomized training get a
# fixed seed so the comparison table is reproducible between runs.
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=3),
    '决策树': DecisionTreeClassifier(max_depth=3, random_state=42),
    '逻辑回归': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='rbf'),
    '随机森林': RandomForestClassifier(n_estimators=100, random_state=42),
    '梯度提升': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

results = []
for name, clf in classifiers.items():
    # Time only the fit() call so the reported value is really the training
    # time; perf_counter is a monotonic, high-resolution clock suited to
    # measuring short intervals.
    start_time = time.perf_counter()
    clf.fit(X_train_scaled, y_train)
    training_time = time.perf_counter() - start_time

    # Scoring happens outside the timed region.
    score = clf.score(X_test_scaled, y_test)

    results.append({
        '算法': name,
        '准确率': f"{score:.3f}",
        '训练时间': f"{training_time:.3f}秒",
    })

results_df = pd.DataFrame(results)
print("分类算法对比结果:")
print(results_df)
数据分类的基本流程可以总结为以下几个步骤:
- 数据预处理:标准化/归一化处理特征数据
- 数据划分:训练集和测试集划分
- 模型选择:根据数据特点选择合适的算法
- 模型训练:使用训练数据学习模式
- 模型评估:使用测试数据评估性能
- 预测应用:对新数据进行分类
这些案例涵盖了从简单到复杂的各种数据分类场景,你可以根据实际需求选择合适的实现方式。