# -*- coding: utf-8 -*-from sklearn.cluster import KMeansfrom sklearn.externals import joblibimport numpyimport pandas as pddef kmeans(inputfile,n): final = open(‘data/dataset.csv‘ , ‘r‘) data = [line.strip().split(‘,‘) for line in final] feature = [[float(x) for x in row[1]] for row in data] # print feature #调用kmeans类 clf = KMeans(n_clusters=n) #给定类别个数为3 s = clf.fit(feature) # print s #聚类中心坐标 print ‘聚类中心‘,clf.cluster_centers_ #每个样本所属的簇 print ‘每个样本所属的簇‘,clf.labels_ #每个点的分类 #用来评估簇的个数是否合适,距离越小说明簇分的越好,选取临界点的簇个数 print clf.inertia_ #每个点到其簇的质心的距离之和 #进行预测 df = pd.read_csv(‘data/dataset1.csv‘) #未添加类标签的数据集 label = clf.predict(feature) labelpre = pd.DataFrame(label,columns=[‘label‘]) df[‘label‘] = labelpre #将聚类之后的类标签添加到数据集中 # print clf.predict(feature) print df #保存模型 joblib.dump(clf , ‘data/km.pkl‘) #载入保存的模型 clf = joblib.load(‘data/km.pkl‘) print ‘clf‘,clf ‘‘‘ #用来评估簇的个数是否合适,距离越小说明簇分的越好,选取临界点的簇个数 for i in range(5,30,1): clf = KMeans(n_clusters=i) s = clf.fit(feature) print i , clf.inertia_ ‘‘‘kmeans(‘data/danger.csv‘,3)
# Original source: https://www.cnblogs.com/eternallql/p/8142990.html
# Time: 2024-10-08 07:29:52