
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
# ---------------------------------------------------------------------------
# Load the digits dataset and standardize it to zero mean / unit variance
# (KMeans is Euclidean-distance based, so feature scaling matters).
# ---------------------------------------------------------------------------
np.random.seed(42)  # seed explicitly so repeated runs give comparable scores
digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))  # number of distinct target classes
labels = digits.target  # ground-truth labels, used only for scoring

# Silhouette is O(n^2) in the number of points scored; evaluate it on a
# random subsample of this size instead of the full dataset.
sample_size = 300

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))

print(79 * '_')
# Table header for bench_k_means rows (adjacent string literals concatenate).
print('% 9s' % 'init'
      '    time  inertia    homo   compl  v-meas     ARI     AMI  silhouette')
def bench_k_means(estimator, name, data):
    """Fit *estimator* on *data* and print a one-line scorecard.

    The row reports wall-clock fit time, inertia, and several clustering
    quality metrics computed against the module-level ground truth
    ``labels``; silhouette is estimated on a ``sample_size`` subsample.
    """
    start = time()
    estimator.fit(data)
    elapsed = time() - start

    predicted = estimator.labels_
    scores = (
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
        metrics.silhouette_score(data, predicted,
                                 metric='euclidean',
                                 sample_size=sample_size),
    )
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % ((name, elapsed, estimator.inertia_) + scores))
# Benchmark the two stochastic initialization strategies (10 restarts each).
for init_name in ("k-means++", "random"):
    bench_k_means(KMeans(init=init_name, n_clusters=n_digits, n_init=10),
                  name=init_name, data=data)

# Seeding KMeans with the PCA components is deterministic, so a single
# initialization run (n_init=1) suffices.
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based", data=data)

print(79 * '_')
# Visualize the clustering on data reduced to its first two principal
# components, painting each decision region in a distinct color.
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Mesh step size: smaller values give a finer decision-boundary plot.
h = .02

# Mesh bounds, padded by one unit on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Label every mesh point with the trained model, then paint the regions.
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = kmeans.predict(mesh_points)
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z,
           interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto',
           origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

# Mark each cluster centroid with a white cross.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=1, color='w', zorder=10)

plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

运行输出:

    n_digits: 10,    n_samples 1797,    n_features 64
    _______________________________________________________________________________
    init         time  inertia    homo   compl  v-meas     ARI     AMI  silhouette
    k-means++   0.43s   69684    0.683   0.722   0.702   0.573   0.699   0.154
    random      0.30s   69656    0.673   0.713   0.692   0.558   0.689   0.120
    PCA-based   0.05s   70793    0.667   0.695   0.681   0.553   0.677   0.156
    _______________________________________________________________________________

算法:k均值聚类首先随机选取k个对象作为初始聚类中心,然后计算每个对象与各个聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心;分配完成后重新计算每个簇的中心,并重复上述过程,直到聚类中心不再变化(即收敛)为止。
本文分享自 图像处理与模式识别研究所 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!