本文目录导读:准备环境、生成示例数据、K-Means聚类、DBSCAN聚类、层次聚类、完整聚类分析流程、实战:客户分群案例、聚类效果评估、总结关键点

我来详细讲解Python实现数据聚类的几种常用方法,并附带完整案例。
准备环境
# Install the required libraries:
# pip install numpy pandas matplotlib scikit-learn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)
from sklearn.preprocessing import StandardScaler
生成示例数据
# Generate synthetic data: 300 samples drawn around 4 blob centres.
# random_state is fixed so every run produces the same points.
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)

# Visualise the raw (unlabelled) data using the first two feature columns.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.title('原始数据分布')
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.show()
K-Means聚类
# K-Means clustering with 4 clusters (the number of centres used to
# generate X above).
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans_labels = kmeans.fit_predict(X)
kmeans_centers = kmeans.cluster_centers_

# Left panel: samples coloured by assigned cluster, centres as red crosses.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', s=50)
plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, 1],
            marker='x', s=200, linewidths=3, color='red')
plt.title('K-Means聚类结果')
plt.xlabel('特征1')
plt.ylabel('特征2')

# Elbow method: fit one model per K in 1..10 and record its inertia.
inertias = []
K_range = range(1, 11)
for k in K_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=0)
    kmeans_temp.fit(X)
    inertias.append(kmeans_temp.inertia_)

# Right panel: inertia as a function of K.
plt.subplot(1, 2, 2)
plt.plot(K_range, inertias, 'bo-')
plt.xlabel('K值')
plt.ylabel('惯性值')
plt.title('肘部法则')
plt.tight_layout()
plt.show()

# Silhouette score of the 4-cluster model fitted at the top of this section.
print(f"K-Means轮廓系数: {silhouette_score(X, kmeans_labels):.3f}")
DBSCAN聚类
# DBSCAN clustering with a single (eps, min_samples) setting.
dbscan = DBSCAN(eps=0.3, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# Left panel: samples coloured by DBSCAN label (noise points carry label -1).
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis', s=50)
plt.title('DBSCAN聚类结果')
plt.xlabel('特征1')
plt.ylabel('特征2')

# Parameter tuning: refit with several eps values and record, for each one,
# how many clusters were found and how many samples were flagged as noise.
eps_values = [0.1, 0.3, 0.5, 0.7]
cluster_counts = []
noise_counts = []
for eps in eps_values:
    dbscan_temp = DBSCAN(eps=eps, min_samples=5)
    labels_temp = dbscan_temp.fit_predict(X)
    # The noise label -1 is not a cluster, so it is removed from the count.
    n_clusters = len(set(labels_temp)) - (1 if -1 in labels_temp else 0)
    n_noise = int((labels_temp == -1).sum())
    cluster_counts.append(n_clusters)
    noise_counts.append(n_noise)
    print(f"eps={eps}: 聚类数={n_clusters}, 噪声点={n_noise}")

# Right panel: the original created this subplot but never drew on it,
# leaving an empty axes; plot the counts gathered above instead.
plt.subplot(1, 2, 2)
plt.plot(eps_values, cluster_counts, 'bo-', label='聚类数')
plt.plot(eps_values, noise_counts, 'rs--', label='噪声点')
plt.xlabel('eps')
plt.ylabel('数量')
plt.title('不同eps值的效果')
plt.legend()
plt.tight_layout()
plt.show()
层次聚类
# Agglomerative (hierarchical) clustering into 4 clusters.
agg_clustering = AgglomerativeClustering(n_clusters=4)
agg_labels = agg_clustering.fit_predict(X)

# Left panel: samples coloured by assigned cluster.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=agg_labels, cmap='viridis', s=50)
plt.title('层次聚类结果')
plt.xlabel('特征1')
plt.ylabel('特征2')

# Right panel: dendrogram built from a Ward linkage of the same data.
# `dendrogram` and `linkage` come from scipy.cluster.hierarchy, imported
# with the other dependencies at the top of the file.
plt.subplot(1, 2, 2)
linked = linkage(X, 'ward')
# truncate_mode='level' with p=5 limits how many levels of the tree are drawn.
dendrogram(linked, truncate_mode='level', p=5)
plt.title('层次聚类树状图')
plt.xlabel('样本索引')
plt.ylabel('距离')
plt.tight_layout()
plt.show()
完整聚类分析流程
class ClusterAnalyzer:
    """Standardise a dataset and scan K-Means over a range of K values.

    Typical use: ``ClusterAnalyzer(data).preprocess().kmeans_analysis()``.
    """

    def __init__(self, data):
        """Store the raw data and create an unfitted StandardScaler.

        Args:
            data: The samples to cluster; passed unchanged to
                ``StandardScaler.fit_transform`` by :meth:`preprocess`.
        """
        self.data = data
        self.scaler = StandardScaler()
        # Filled in by preprocess(); None means "not standardised yet".
        self.scaled_data = None
        # Per-algorithm results, keyed by algorithm name (e.g. 'kmeans').
        self.results = {}

    def preprocess(self):
        """Standardise ``self.data`` into ``self.scaled_data``.

        Returns:
            self, so calls can be chained.
        """
        self.scaled_data = self.scaler.fit_transform(self.data)
        return self

    def kmeans_analysis(self, k_range=range(2, 11)):
        """Fit K-Means for every K in *k_range*, plot and rank the results.

        Args:
            k_range: Iterable of cluster counts to try.

        Returns:
            The K with the highest silhouette score, as returned by
            :meth:`plot_results`.

        Raises:
            ValueError: If *k_range* yields no values.
        """
        # Calling this before preprocess() used to pass None to KMeans and
        # fail with an unrelated error; standardise on demand instead.
        if self.scaled_data is None:
            self.preprocess()

        results = []
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=0)
            labels = kmeans.fit_predict(self.scaled_data)
            silhouette = silhouette_score(self.scaled_data, labels)
            results.append({
                'k': k,
                'inertia': kmeans.inertia_,
                'silhouette': silhouette,
            })
        self.results['kmeans'] = results
        return self.plot_results(results, 'K-Means')

    def plot_results(self, results, title):
        """Plot inertia and silhouette against K, then report the best K.

        Args:
            results: List of dicts with keys ``'k'``, ``'inertia'`` and
                ``'silhouette'``.
            title: Prefix used in both subplot titles.

        Returns:
            The ``'k'`` of the entry with the highest silhouette score.

        Raises:
            ValueError: If *results* is empty.
        """
        if not results:
            raise ValueError("results is empty: nothing to plot or rank")

        ks = [r['k'] for r in results]
        inertias = [r['inertia'] for r in results]
        silhouettes = [r['silhouette'] for r in results]

        _, axes = plt.subplots(1, 2, figsize=(12, 4))
        axes[0].plot(ks, inertias, 'bo-')
        axes[0].set_xlabel('K值')
        axes[0].set_ylabel('惯性值')
        axes[0].set_title(f'{title} - 肘部法则')
        axes[1].plot(ks, silhouettes, 'ro-')
        axes[1].set_xlabel('K值')
        axes[1].set_ylabel('轮廓系数')
        axes[1].set_title(f'{title} - 轮廓系数')
        plt.tight_layout()
        plt.show()

        best_k, best_score = self._pick_best_k(results)
        print(f"最佳K值: {best_k} (轮廓系数: {best_score:.3f})")
        return best_k

    @staticmethod
    def _pick_best_k(results):
        """Return ``(k, silhouette)`` of the entry with the top silhouette.

        On ties the earliest entry wins, because ``np.argmax`` returns the
        first index of the maximum.
        """
        scores = [r['silhouette'] for r in results]
        best_index = int(np.argmax(scores))
        return results[best_index]['k'], scores[best_index]
# Run the full pipeline on X: preprocess() returns the analyzer itself, so
# construction and standardisation are chained before the K scan.
analyzer = ClusterAnalyzer(X).preprocess()
best_k = analyzer.kmeans_analysis()
实战:客户分群案例
# Generate a random customer table; the seed is fixed for reproducibility.
np.random.seed(42)
n_customers = 200
data = {
    '年龄': np.random.randint(18, 65, n_customers),
    '收入': np.random.normal(50000, 20000, n_customers),
    '消费金额': np.random.normal(2000, 800, n_customers),
    '购物频率': np.random.randint(1, 30, n_customers)
}
df = pd.DataFrame(data)

# Standardise the two columns used for clustering (income and spending).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['收入', '消费金额']])

# K-Means clustering. The segment count is named once so the model and the
# report loop below cannot drift apart.
n_segments = 3
kmeans = KMeans(n_clusters=n_segments, random_state=0)
df['客户类别'] = kmeans.fit_predict(scaled_data)

# Per-segment summary: size and the mean of every column.
print("\n客户分群统计:")
for cluster in range(n_segments):
    cluster_data = df[df['客户类别'] == cluster]
    print(f"\n类别 {cluster}:")
    print(f" 数量: {len(cluster_data)}")
    print(f" 平均年龄: {cluster_data['年龄'].mean():.1f}")
    print(f" 平均收入: {cluster_data['收入'].mean():.0f}")
    print(f" 平均消费: {cluster_data['消费金额'].mean():.0f}")
    print(f" 购物频率: {cluster_data['购物频率'].mean():.1f}")

# Visualise the segments in the original (unscaled) income/spending plane.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(df['收入'], df['消费金额'],
                      c=df['客户类别'], cmap='viridis', s=50)
plt.xlabel('收入')
plt.ylabel('消费金额')
plt.title('客户分群结果')
plt.colorbar(scatter)
plt.show()
聚类效果评估
def evaluate_clustering(X, labels, algorithm_name):
    """Print internal validation metrics for one clustering result.

    The silhouette, Calinski-Harabasz and Davies-Bouldin scores are printed
    when more than one cluster was found; otherwise a short notice is
    printed instead.

    Args:
        X: The samples that were clustered.
        labels: One cluster label per sample; -1 is treated as noise.
        algorithm_name: Name shown in the report header.

    Returns:
        None. All output goes to stdout.
    """
    # Noise (-1) is not counted as a cluster. Note that the unfiltered
    # labels are still what is passed to the metric functions below.
    distinct_labels = set(labels)
    n_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)

    print(f"\n{algorithm_name} 评估结果:")
    print(f"聚类数量: {n_clusters}")

    if n_clusters <= 1:
        print("需要多个聚类才能进行效果评估")
        return

    silhouette = silhouette_score(X, labels)
    ch_score = calinski_harabasz_score(X, labels)
    db_score = davies_bouldin_score(X, labels)
    print(f"轮廓系数: {silhouette:.3f}")
    print(f"Calinski-Harabasz分数: {ch_score:.3f}")
    print(f"Davies-Bouldin分数: {db_score:.3f}")

    # Verdict is based on the silhouette score only, using the cut-offs
    # 0.5 and 0.25.
    if silhouette > 0.5:
        print(" 聚类效果良好")
    elif silhouette > 0.25:
        print(" 聚类效果一般")
    else:
        print(" 聚类效果较差")
# Evaluate the label sets produced by the three algorithms on the same X,
# in the order K-Means, DBSCAN, agglomerative.
banner = "=" * 50
print(banner)
print("聚类算法对比评估")
print(banner)
for algo_name, algo_labels in (
    ("K-Means", kmeans_labels),
    ("DBSCAN", dbscan_labels),
    ("层次聚类", agg_labels),
):
    evaluate_clustering(X, algo_labels, algo_name)
总结关键点
算法选择建议
- K-Means: 球形簇、数据量大的情况
- DBSCAN: 任意形状簇、有噪声数据
- 层次聚类: 需要层次结构、小数据集
实际应用技巧
- 数据标准化:不同尺度的特征会影响聚类结果
- 降维处理:高维数据先降维再聚类
- 异常值处理:异常值会影响聚类中心
- 多指标评估:结合多个指标选择最佳参数
常见问题解决
- K值选择困难:肘部法则 + 轮廓系数
- 噪声点处理:DBSCAN或预处理去噪
- 大规模数据:使用 Mini-Batch K-Means(scikit-learn 中的 MiniBatchKMeans)
- 非球形簇:谱聚类或DBSCAN
这个完整教程涵盖了Python数据聚类的核心方法和实战技巧,你可以根据实际需求选择合适的算法和参数。