【机器学习实战之三】：C++实现K-均值（K-Means）聚类算法

聚类是一种无监督的学习，它将相似的对象归到同一个簇中。它有点像全自动分类（类别体系是自动构建的）。聚类方法几乎可以应用于所有对象，簇内的对象越相似，聚类的效果越好。本文要介绍一种称为K-均值（K-means）聚类的算法。之所以称之为K-均值是因为它可以发现k个不同的簇，且每个簇的中心采用簇中所含值的均值计算而成。

在介绍K-均值之前，先讨论一下簇识别（cluster identification）。簇识别给出聚类结果的含义。假定有一些数据，现在将相似数据归到一起，簇识别会告诉我们这些簇到底都是些什么。聚类与分类的最大不同在于，分类的目标事先已经知道，而聚类则不一样。因为其产生的结果与分类相同，只是类别没有预先定义，聚类有时也被称为无监督分类（unsupervised classification）。

聚类分析试图将相似对象归入同一簇，将不相似对象归到不同簇。相似这一概念取决于所选择的相似度计算方法。

一、Kmeans算法

优点：容易实现

缺点：可能收敛到局部最小值，在大规模数据集上收敛较慢

使用数据类型：数值型数据

K-均值是发现给定数据集的k个簇的算法。簇个数k是用户给定的（当然如何选择k使得聚类更准确是一个难题），每一个簇通过其质心（centroid），即簇中所有点的中心来描述。

K-均值算法工作流程是这样的。首先，随机确定k个初始点作为质心。然后将数据集中的每个点分配到一个簇中，具体来讲，为每个点找距其最近的质心，并将其分配给该质心所对应的簇。这一步完成之后，每个簇的质心更新为该簇所有点的平均值。上述“分配”和“更新”两步反复进行，直到没有任何点的簇分配结果发生改变为止。

伪代码：

创建k个点作为起始质心(经常是随机选择)
当任意一个点的簇分配结果发生改变时
	对数据集中的每个数据点
		对每个质心
			计算质心与数据点之间的距离
		将数据点分配到距其最近的簇
	对每一个簇，计算簇中所有点的均值并将均值作为质心

二、K-均值聚类的一般流程

(1)收集数据：使用任意方法

(2)准备数据：需要数值型数据来计算距离，也可以将标称型数据映射为二值型数据再用于距离计算。

(3)分析数据：使用任意方法

(4)训练算法：不适用于无监督学习，即无监督学习没有训练过程

(5)测试算法：应用聚类算法、观察结果。可以使用量化的误差指标如误差平方和来评价算法的结果。

(6)使用算法：可以用于所希望的任何应用。通常情况下，簇质心可以代表整个簇的数据来做出决策。

三、算法设计和实现

首先看一下类的定义：

/*
 * K-means clusterer for vectors whose components are of type T.
 * The class body only declares the member functions; they are defined
 * outside the class.
 */
template<typename T>
class KMEANS
{
private:
	vector< vector<T> > dataSet;//the data set: one inner vector per data point
	vector< T > mmin,mmax;//NOTE(review): not used by any member function shown in this article
	int colLen,rowLen;//colLen:the dimension of vector;rowLen:the number of vectors
	int k;//the number of clusters, received by the constructor
	vector< vector<T> > centroids;//the cluster centers
	//smallest and largest value of one column of dataSet
	typedef struct MinMax
	{
		T Min;
		T Max;
		MinMax(T min , T max):Min(min),Max(max) {}
	}tMinMax;
	//cluster assignment of one data point
	typedef struct Node
	{
		int minIndex; //index of the cluster the point is assigned to
		double minDist; //distEclud value between the point and that cluster's centroid
		Node(int idx,double dist):minIndex(idx),minDist(dist) {}
	}tNode;
	vector<tNode>  clusterAssment;//one assignment record per data point

	/*split line into numbers*/
	void split(char *buffer , vector<T> &vec);
	/*min and max of column idx of dataSet*/
	tMinMax getMinMax(int idx);
	/*give component idx of every centroid a random value inside [Min,Max]*/
	void setCentroids(tMinMax &tminmax , int idx);
	/*create the per-point assignment records*/
	void initClusterAssment();
	/*distance measure between two vectors*/
	double distEclud(vector<T> &v1 , vector<T> &v2);

public:
	KMEANS(int k);
	/*read the data file into dataSet and set colLen,rowLen*/
	void loadDataSet(char *filename);
	/*create k random starting centroids*/
	void randCent();
	/*write every point and its cluster index to the result file*/
	void print();
	/*run the clustering*/
	void kmeans();
};

数据成员：

+vector< vector<T> > dataSet：训练数据，即所有点的集合，是一个二维的矩阵。

+colLen：向量的维度；rowLen：向量的个数

+k：KMeans中对应的k值

+vector< vector<T> > centroids：k个簇的中心点

+tMinMax：存有最大最小值的结构体

+tNode：标识着每个点所属的簇类别以及距离值

+vector<tNode> clusterAssment：代表每一个向量所属的类别（簇）。

成员函数：

+void loadDataSet( char *filename )：读入文件信息并初始化dataSet以及colLen、rowLen。

+void split(char *buffer , vector<T> &vec)：将一行切分成多个数值放入容器中。

+void randCent()：随机生成k个中心点

+void kmeans()：kmeans的核心函数，计算k个中心点以及每个点的归类。

+double distEclud(vector<T> &v1 , vector<T> &v2)：计算两个向量之间的距离。注意代码返回的是各分量之差的平方和，即欧氏距离的平方（没有开方）；在比较远近、寻找最近质心时，它与欧氏距离是等价的。

+void print()：打印结果

在这里贴一下kmeans的核心函数kmeans()：

template<typename T>
void KMEANS<T>::kmeans() //assign points to centroids and move the centroids until no point changes cluster
{
	initClusterAssment(); //create the per-point assignment records
	bool clusterChanged = true; //true while the latest pass moved at least one point to another cluster
	//the loop ends once a pass changes no assignment; a cap on the number of passes (such as 1000) could be used as an alternative stop condition
	while( clusterChanged )
	{
		clusterChanged = false;
		//step one : find the nearest centroid of each point
		cout<<"find the nearest centroid of each point : "<<endl;
		for(int i=0;i<rowLen;i++)
		{
			int minIndex = -1; //stays -1 when no centroid distance is below the starting minDist
			double minDist = INT_MAX; //starting bound of the search below
			//distance between dataSet[i] and all centroids
			for(int j=0;j<k;j++)
			{
				double distJI = distEclud( centroids[j],dataSet[i] );
				if( distJI < minDist ) //strict comparison: on a tie the lower centroid index is kept
				{
					minDist = distJI;
					minIndex = j;
				}
			}
			//record the result only when dataSet[i] moved to another cluster; NOTE: minDist is therefore not refreshed while the index stays the same
			if( clusterAssment[i].minIndex != minIndex )
			{
				clusterChanged = true;
				clusterAssment[i].minIndex = minIndex;
				clusterAssment[i].minDist = minDist ;
			}
		}

		//step two : update the centroids
		cout<<"update the centroids:"<<endl;
		for(int cent=0;cent<k;cent++)
		{
			vector<T> vec(colLen,0); //component-wise sum of the points assigned to cluster cent
			int cnt = 0; //number of points assigned to cluster cent
			for(int i=0;i<rowLen;i++)
			{
				if( clusterAssment[i].minIndex == cent )
				{
					++cnt;
					//add dataSet[i] to the running sum
					for(int j=0;j<colLen;j++)
					{
						vec[j] += dataSet[i].at(j);
					}
				}
			}

			//mean of the vector and update the centroids[cent]
			for(int i=0;i<colLen;i++)
			{
				if( cnt!=0 )	vec[i] /= cnt; //NOTE: with cnt==0 vec stays all zero, so an empty cluster's centroid is set to the origin
				centroids[cent].at(i) = vec[i];
			}
		}//for
		print();//write the current cluster index of every point after each pass
	}//while

#if 0
	//disabled debug output: cluster index and stored distance of every point
	typename vector<tNode> :: iterator it = clusterAssment.begin();
	while( it!=clusterAssment.end() )
	{
		cout<<(*it).minIndex<<"\t"<<(*it).minDist<<endl;
		it++;
	}
#endif
}

做一个简单的解释：

line 4：首先初始化clusterAssment的值，每个元素为一个结构体，一个域值为簇的索引值；一个域值为存储误差。

line 5：创建一个标志变量clusterChanged，如果该值为true，说明有点改变了簇的归属，那么继续迭代，直到没有任何一个点改变簇的归属为止。当然还有一种方法来作为循环的结束条件：限制循环的次数，例如最多循环1000次。

line 12~33：第一步：找到每个点距离最近的中心点。line17~25 计算dataSet的第i个向量到所有簇中心centroids的距离，并记下其中最小的一个。line27~32 如果该点所属的簇发生了变化，则更新它的簇索引和距离值，并将clusterChanged置为true。

line 37~60：第二步：将同一类的向量相加并且求平均然后更新簇中心点centroids的值。

输入数据集testSet.txt（http://yunpan.cn/cyRYWWeXKa8bU（提取码：f2ad））为：

1.658985	4.285136
-3.453687	3.424321
4.838138	-1.151539
-5.379713	-3.362104
0.972564	2.924086
-3.567919	1.531611
0.450614	-3.302219
-3.487105	-1.724432
2.668759	1.594842
-3.156485	3.191137
3.165506	-3.999838
-2.786837	-3.099354
4.208187	2.984927
-2.123337	2.943366
0.704199	-0.479481
-0.392370	-3.963704
2.831667	1.574018
-0.790153	3.343144
2.943496	-3.357075
-3.195883	-2.283926
2.336445	2.875106

完整代码kmeans.cc：

#include<iostream>
#include<vector>
#include<map>
#include<cstdlib>
#include<algorithm>
#include<fstream>
#include<stdio.h>
#include<string.h>
#include<string>
#include<time.h>  //for srand
#include<limits.h> //for INT_MIN INT_MAX

using namespace std;

/*
 * K-means clustering of vectors whose components are of type T.
 * main() drives an instance in this order: loadDataSet(), randCent(), kmeans().
 */
template<typename T>
class KMEANS
{
private:
	/* value range of a single column of the data set */
	struct MinMax
	{
		T Min;
		T Max;
		MinMax(T min , T max) : Min(min) , Max(max)
		{
		}
	};
	typedef MinMax tMinMax;

	/* cluster assignment of a single data point */
	struct Node
	{
		int minIndex;   /* index of the cluster the point belongs to */
		double minDist; /* distEclud value between the point and that centroid */
		Node(int idx , double dist) : minIndex(idx) , minDist(dist)
		{
		}
	};
	typedef Node tNode;

	/* the loaded vectors, one per row */
	vector< vector<T> > dataSet;
	/* declared but not referenced by any member function in this file */
	vector< T > mmin;
	vector< T > mmax;
	/* dimension of every vector */
	int colLen;
	/* number of vectors */
	int rowLen;
	/* number of clusters */
	int k;
	/* current cluster centers */
	vector< vector<T> > centroids;
	/* one assignment record per data point */
	vector<tNode> clusterAssment;

	/* break a text line into numbers appended to vec */
	void split(char *buffer , vector<T> &vec);
	/* range of column idx */
	tMinMax getMinMax(int idx);
	/* randomize component idx of every centroid inside the given range */
	void setCentroids(tMinMax &tminmax , int idx);
	/* create the assignment records */
	void initClusterAssment();
	/* sum of squared component differences of v1 and v2 */
	double distEclud(vector<T> &v1 , vector<T> &v2);

public:
	KMEANS(int k);
	/* read the data file into dataSet and set colLen / rowLen */
	void loadDataSet(char *filename);
	/* pick k random starting centroids inside the bounding box of the data */
	void randCent();
	/* write every point followed by its cluster index to res.txt */
	void print();
	/* iterate assignment and centroid update until nothing changes */
	void kmeans();
};

/*
 * Reset clusterAssment to exactly one record per data row, each record
 * unassigned (cluster index -1, distance -1).
 *
 * assign() replaces whatever the vector held before. Appending with
 * push_back would grow the vector by rowLen on every call, so a second
 * kmeans() run would start from the previous run's assignments.
 */
template<typename T>
void KMEANS<T>::initClusterAssment()
{
	tNode node(-1,-1);
	//a negative rowLen is treated as "no rows" instead of wrapping to a huge count
	const typename vector<tNode>::size_type count = rowLen>0 ? rowLen : 0;
	clusterAssment.assign(count,node);
}

/*
 * Run k-means on dataSet, starting from the centroids currently stored in
 * `centroids` (main() fills them with randCent() first).
 *
 * Every pass
 *   1. assigns each point to the centroid with the smallest distEclud value,
 *   2. moves each centroid to the mean of the points assigned to it.
 * Passes repeat until an assignment pass changes no point. print() runs after
 * every pass, so the result file always reflects the latest assignments.
 */
template<typename T>
void KMEANS<T>::kmeans()
{
	initClusterAssment();
	bool clusterChanged = true;
	//the termination condition can also be a cap on the number of passes, such as 1000
	while( clusterChanged )
	{
		clusterChanged = false;
		//step one : find the nearest centroid of each point
		cout<<"find the nearest centroid of each point : "<<endl;
		for(int i=0;i<rowLen;i++)
		{
			//-1/-1 is the "unassigned" record; it only survives when k<=0
			int minIndex = -1;
			double minDist = -1;
			for(int j=0;j<k;j++)
			{
				double distJI = distEclud( centroids[j],dataSet[i] );
				//the first centroid always seeds the search, so a point is
				//assigned even when every distance value is INT_MAX or larger
				if( minIndex<0 || distJI < minDist )
				{
					minDist = distJI;
					minIndex = j;
				}
			}
			if( clusterAssment[i].minIndex != minIndex )
			{
				clusterChanged = true;
			}
			//always store the distance: the centroids move between passes, so
			//the value changes even when the point stays in the same cluster
			clusterAssment[i].minIndex = minIndex;
			clusterAssment[i].minDist = minDist;
		}

		//step two : update the centroids
		cout<<"update the centroids:"<<endl;
		for(int cent=0;cent<k;cent++)
		{
			vector<T> vec(colLen,0);
			int cnt = 0;
			for(int i=0;i<rowLen;i++)
			{
				if( clusterAssment[i].minIndex == cent )
				{
					++cnt;
					//sum of two vectors
					for(int j=0;j<colLen;j++)
					{
						vec[j] += dataSet[i].at(j);
					}
				}
			}

			//an empty cluster has no mean: keep its previous centroid instead
			//of overwriting it with the all-zero sum
			if( cnt==0 )
			{
				continue;
			}

			//mean of the vector and update the centroids[cent]
			for(int i=0;i<colLen;i++)
			{
				vec[i] /= cnt;
				centroids[cent].at(i) = vec[i];
			}
		}//for
		print();//write the current assignments
	}//while

#if 0
	//disabled debug output: cluster index and stored distance of every point
	typename vector<tNode> :: iterator it = clusterAssment.begin();
	while( it!=clusterAssment.end() )
	{
		cout<<(*it).minIndex<<"\t"<<(*it).minDist<<endl;
		it++;
	}
#endif
}

/*
 * Create a clusterer for k clusters with no data loaded yet.
 *
 * colLen and rowLen start at 0 so that the loops in randCent(), kmeans() and
 * print() run zero times instead of reading indeterminate values when one of
 * them is called before loadDataSet(). The initializers are listed in
 * declaration order.
 */
template<typename T>
KMEANS<T>::KMEANS(int k)
	: colLen(0) , rowLen(0) , k(k)
{
}

/*
 * Give component idx of each of the first k centroids a random value between
 * tminmax.Min and tminmax.Max. One rand() call is consumed per centroid, in
 * increasing centroid order.
 */
template<typename T>
void KMEANS<T>::setCentroids(tMinMax &tminmax,int idx)
{
	T span = tminmax.Max - tminmax.Min;
	int c = 0;
	while( c<k )
	{
		/* rand()/RAND_MAX is a fraction in [0,1]; it is scaled into [Min,Max] */
		centroids[c].at(idx) = tminmax.Min + span * ( rand() / (double)RAND_MAX );
		++c;
	}
}

/*
 * Return the smallest and largest value found in column idx of dataSet.
 *
 * The scan is seeded from the first row alone, so a data set with a single
 * row no longer reads the non-existent dataSet[1]. An empty data set yields
 * the range (T(),T()). Rows shorter than idx+1 make at() throw
 * std::out_of_range.
 */
template<typename T>
typename KMEANS<T>::tMinMax KMEANS<T>::getMinMax(int idx)
{
	if( dataSet.empty() )
	{
		return tMinMax( T(),T() );
	}

	T lowest = dataSet[0].at(idx);
	T highest = lowest;
	for(int i=1;i<rowLen;i++)
	{
		const T &value = dataSet[i].at(idx);
		if( value < lowest )	lowest = value;
		else if( value > highest ) highest = value;
	}

	return tMinMax(lowest,highest);
}

/*
 * Create k starting centroids: for every column, each centroid receives a
 * random value inside that column's [min,max] range.
 *
 * assign() replaces any centroids from an earlier call; push_back would
 * leave 2k, 3k, ... entries behind when randCent() is called repeatedly.
 * The generator is re-seeded from the wall clock on every call, so two calls
 * within the same second produce the same centroids.
 */
template<typename T>
void KMEANS<T>::randCent()
{
	//init centroids: k zero vectors of colLen components (none when k<=0)
	vector<T> vec(colLen,0);
	const typename vector< vector<T> >::size_type count = k>0 ? k : 0;
	centroids.assign(count,vec);

	//set values by column
	srand( time(NULL) );
	for(int j=0;j<colLen;j++)
	{
		tMinMax tminmax = getMinMax(j);
		setCentroids(tminmax,j);
	}
}

/*
 * Return the sum of squared component differences of v1 and v2, i.e. the
 * squared Euclidean distance (no square root is taken). The sum is
 * accumulated in T and visits as many components as v1 holds.
 */
template<typename T>
double KMEANS<T>::distEclud(vector<T> &v1 , vector<T> &v2)
{
	T squares = 0;
	typename vector<T>::const_iterator a = v1.begin();
	typename vector<T>::const_iterator b = v2.begin();
	for( ; a!=v1.end() ; ++a,++b )
	{
		squares += (*a - *b)*(*a - *b);
	}
	return squares;
}

/*
 * Split one text line into numbers and append them to vec.
 *
 * buffer is modified in place by strtok. Fields may be separated by spaces
 * or tabs, and a trailing line ending (\n or \r\n) is ignored. The same
 * delimiter set is used for every token: with " " alone on the follow-up
 * calls, a row of three or more tab-separated columns lost every field after
 * the second one, and a line holding only a newline produced a bogus 0.
 * A blank line appends nothing.
 */
template<typename T>
void KMEANS<T>::split(char *buffer , vector<T> &vec)
{
	if( buffer==NULL )
	{
		return;
	}

	const char *delimiters = " \t\r\n";
	for(char *p = strtok(buffer,delimiters); p!=NULL; p = strtok(NULL,delimiters))
	{
		vec.push_back( atof(p) );
	}
}

/*
 * Write the clustering result to res.txt (the file is overwritten): one line
 * per data row holding the row's components and then its cluster index, all
 * separated by tabs.
 *
 * Rows that have no assignment record yet (print() called before kmeans())
 * are written with index -1, the same "unassigned" value that
 * initClusterAssment() uses, instead of reading past the end of
 * clusterAssment. If the file cannot be opened, a message is printed and the
 * program exits with a failure status.
 */
template<typename T>
void KMEANS<T>::print()
{
	ofstream fout;
	fout.open("res.txt");
	if(!fout)
	{
		cout<<"file res.txt open failed"<<endl;
		exit(EXIT_FAILURE);
	}

#if 0
	//disabled debug output: the components of every centroid
	typename vector< vector<T> > :: iterator it = centroids.begin();
	while( it!=centroids.end() )
	{
		typename vector<T> :: iterator it2 = (*it).begin();
		while( it2 != (*it).end() )
		{
			//fout<<*it2<<"\t";
			cout<<*it2<<"\t";
			it2++;
		}
		//fout<<endl;
		cout<<endl;
		it++;
	}
#endif

	typedef typename vector< vector<T> >::size_type row_index;
	//never walk further than the rows that are really stored
	row_index rows = rowLen>0 ? static_cast<row_index>(rowLen) : 0;
	if( rows > dataSet.size() )
	{
		rows = dataSet.size();
	}

	for(row_index r=0;r<rows;++r)
	{
		const vector<T> &row = dataSet[r];
		for(typename vector<T>::size_type c=0;c<row.size();++c)
		{
			fout<<row[c]<<"\t";
		}
		const int label = r<clusterAssment.size() ? clusterAssment[r].minIndex : -1;
		fout<<label<<"\n";
	}
}

/*
 * Load the data set from a text file: one vector per line, components
 * separated by whitespace (see split()). Afterwards rowLen is the number of
 * rows read and colLen the number of components of the first row (both 0
 * when the file holds no data).
 *
 * Differences from a plain fopen/fgets loop with a 100-byte buffer:
 *  - the stream and the buffers are released automatically (no leaked FILE*
 *    or new[] buffer);
 *  - lines of any length are read whole instead of being cut into several
 *    rows after 99 characters;
 *  - lines without any number are skipped, and an empty file no longer
 *    reads dataSet[0];
 *  - rows from a previous call are discarded before loading.
 * If the file cannot be opened, a message is printed and the program exits
 * with a failure status.
 *
 * NOTE(review): rows are not checked for having the same number of
 * components as the first row.
 */
template<typename T>
void KMEANS<T>:: loadDataSet(char *filename)
{
	ifstream fin(filename);
	if( !fin )
	{
		printf("open file %s failed...\n",filename);
		exit(EXIT_FAILURE);
	}

	//init dataSet
	dataSet.clear();
	string line;
	vector<char> buffer;
	vector<T> temp;
	while( getline(fin,line) )
	{
		//getline drops '\n' only; drop the '\r' of a Windows line ending too
		if( !line.empty() && line[line.size()-1]=='\r' )
		{
			line.erase(line.size()-1);
		}

		//split() tokenizes in place, so hand it a writable, terminated copy
		buffer.assign(line.begin(),line.end());
		buffer.push_back('\0');

		temp.clear();
		split(&buffer[0],temp);
		if( !temp.empty() )
		{
			dataSet.push_back(temp);
		}
	}

	//init colLen,rowLen
	colLen = dataSet.empty() ? 0 : dataSet[0].size();
	rowLen = dataSet.size();
}

/*
 * Command line driver: ./a.out filename k
 *
 * Loads the data file, picks k random centroids and runs k-means; the
 * clustering result is written by KMEANS::print(). A wrong argument count or
 * a k that is not a positive integer prints the usage line and returns a
 * failure status, so scripts and make can detect the error (atoi yields 0
 * for non-numeric text, which would otherwise cluster with no centroids).
 */
int main( int argc , char *argv[])
{
	const int k = ( argc==3 ) ? atoi(argv[2]) : 0;
	if( argc!=3 || k<1 )
	{
		cout<<"Usage : ./a.out filename k"<<endl;
		return EXIT_FAILURE;
	}

	char *filename = argv[1];
	KMEANS<double> kms(k);
	kms.loadDataSet(filename);
	kms.randCent();
	kms.kmeans();

	return 0;
}

makefile:

# Compile kmeans.cc into ./a.out, then cluster testSet.txt into 4 groups.
# "target" names an action, not a file, so it is declared phony: the recipe
# still runs when a file called "target" happens to exist.
.PHONY: target
target:
	g++ kmeans.cc
	./a.out testSet.txt 4

得到了结果文件res.txt，那么下面用python来画一下图，看看聚类的效果。

res.txt文件：

1.65898	4.28514	2
-3.45369	3.42432	1
4.83814	-1.15154	3
-5.37971	-3.3621	0
0.972564	2.92409	2
-3.56792	1.53161	1
0.450614	-3.30222	3
-3.48711	-1.72443	0
2.66876	1.59484	2
-3.15649	3.19114	1
3.16551	-3.99984	3
-2.78684	-3.09935	0
4.20819	2.98493	2
-2.12334	2.94337	1
0.704199	-0.479481	3
-0.39237	-3.9637	3
2.83167	1.57402	2
-0.790153	3.34314	1
2.9435	-3.35708	3
-3.19588	-2.28393	0
2.33644	2.87511	2
-1.78635	2.55425	1
2.1901	-1.90602	3
-3.40337	-2.77829	0
1.77812	3.88083	2
-1.68835	2.23027	1
2.59298	-2.05437	3
-4.00726	-3.20707	0
2.25773	3.38756	2
-2.67901	0.785119	1
。。。。。

四、结果验证

将res.txt对应的值画成图，需要用到python的一些包（ python2.7.3+NumPy+matplotlib：http://yunpan.cn/cyRSixGj49HVq（提取码：80be））：

import matplotlib.pyplot as plt

# Marker format strings indexed by cluster id.
COLORS = ['ro', 'go', 'bo', 'mo', 'co']


def parse_record(line):
    """Parse one line of res.txt into an (x, y, cluster_id) tuple.

    Returns None for a blank line so the caller can skip it instead of
    failing with IndexError.
    """
    fields = line.split()
    if not fields:
        return None
    return float(fields[0]), float(fields[1]), int(fields[2])


def marker_for(kind, palette=COLORS):
    """Return the plot format string for cluster id ``kind``.

    The palette is cycled, so a run with more clusters than palette entries
    reuses markers instead of raising IndexError.
    """
    return palette[kind % len(palette)]


if __name__ == '__main__':
    plt.figure()
    # 'with' closes the file even when a line fails to parse.
    with open('res.txt', 'r') as fp:
        for line in fp:
            record = parse_record(line)
            if record is None:
                continue
            xi, yi, kind = record
            plt.plot(xi, yi, marker_for(kind), markersize=5)
    plt.savefig('res.png')

效果截图如下：

当选择k=2：