FP_Growth算法原理及实现

***********************************************声明******************************************************

原创作品，出自 “晓风残月xj” 博客，欢迎转载，转载时请务必注明出处（http://blog.csdn.net/xiaofengcanyuexj）。

由于各种原因，可能存在诸多不足，欢迎斧正！

*********************************************************************************************************

前面提到关联规则寻找频繁项集的Apriori算法，Apriori算法是挖掘布尔型关联规则频繁项集的最为经典、最为基本的算法，但是该算法需要不断寻找候选集，然后剪枝即去掉包含非频繁子集的候选集，效率不是很高，时间复杂度由暴力枚举所有子集的指数级别O(n^2) 降为多项式级别，多项式具体系数是底层实现情况而定
。Apriori算法的主要瓶颈在于不断寻找候选项集，可不可以找到一种不用频繁寻找候选项集的算法呢？而且当待挖掘的数据很大进而需要存储在数据库中时，Apriori算法还有一个无可回避的问题就是每次都要扫描数据库，涉及大量I/O操作，比较耗时（当然可以不用数据库）。

FP_Gwoth算法是一种不生成候选集从而寻找频繁项集的算法，主要基于树结构：包含一个一棵FP_Tree和一个项头表，每个项通过一个结点链指向它在树中出现的位置。基本结构如下所示。需要注意的是项头表需要按照支持度递减排序，在FP_Tree（有后缀的也成条件FP_Tree）中高支持度的节点只能是低支持度节点的祖先节点。这样一来可以保证尽可能的共用祖先节点，更重要的是保证正确性。

procedure FP_Growth(FP_Tree, α)

if FP_Tree 只含单个路径P then{
【1】

for 路径P中结点的每个组合（记作β）
【2】

产生模式βUα，其支持度MinSupport =β 中结点的最小支持度；

}

else{

for each αi 在FP_Tree的项头表(按照支持度由低到高顺序进行扫描){ 【3】

产生一个模式β= αiUβ，其支持度MinSupport=αi.MinSupport；

构造β的条件模式基，然后构造β的条件FP_Treeβ；
【4】

if FP_Tree不为空 then

调用 FP_Growth (FP_Treeβ, β)；

}

【1】 FP_Tree 只含单个路径P，即只有一条分支且分支不能分叉，如果分叉可能隐含了分支合并问题，可能导致在为合并之前误删为不满足最小支持度；

【2】若分支上有n个属性值，则总共有2^n组合，可以每个属性值取或不取两种情况递归下去；

【3】当前条件FP_Tree的的项头表，用尾插法建立单链表；

【4】
以当前项头表的αi沿着条件FP_Tree的每条分支向上找出所有条件模式，然后建立后缀模式β的条件FP_Treeβ

源代码:

在此声明，以下代码并不是系本人原创，如需使用，必须声明，谢谢！

//FP_Tree.h

/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved，but you can use this program.
 */
#ifndef FP_TREE_H
#define FP_TREE_H

#include"Transaction.h"
#include"TransactionSet.h"

#include<map>
using namespace std;

const int MAXN_CHILD=20;
typedef string ItemType;

struct ItemSupport
{
	ItemType m_ITItemName;
	int m_nSupportCount;
	ItemSupport(ItemType tItem,int tSup)
	{
		m_ITItemName=tItem;
	    m_nSupportCount=tSup;
	}
};

struct CFP_TreeNode
{
	int m_nSupportCount;
	int m_nChildSize;
	ItemType m_ITItemName;
	CFP_TreeNode *m_pFatherNode;
	CFP_TreeNode *m_pLinkedNode;
	CFP_TreeNode *m_pChildNode[MAXN_CHILD];

	CFP_TreeNode()
	{
		m_ITItemName.clear();
		m_nChildSize=0;
		m_nSupportCount =0;
		m_pFatherNode=NULL;
		m_pLinkedNode=NULL;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

	CFP_TreeNode(int tCount)
	{
		m_ITItemName.clear();
		m_nChildSize=0;
		m_nSupportCount =tCount;
		m_pFatherNode=NULL;
		m_pLinkedNode=NULL;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

	CFP_TreeNode(ItemType tITtem,CFP_TreeNode *tFa,CFP_TreeNode *tLinked,int tCount)
	{
		m_ITItemName=tITtem;
		m_nChildSize=0;
		m_nSupportCount=tCount;
		m_pFatherNode =tFa;
		m_pLinkedNode=tLinked;
		for(int i=0;i<MAXN_CHILD;++i)
			m_pChildNode[i]=NULL;
	}

};

struct CItemHeaderNode
{
	int m_nSupportCount;
	CFP_TreeNode *m_pFPFirst;
	CItemHeaderNode()
	{
		m_nSupportCount =0 ;
		m_pFPFirst = NULL;
	}
	CItemHeaderNode(int tCount)
	{
		m_nSupportCount =tCount ;
		m_pFPFirst = NULL;
	}
};

class CFP_Tree
{
private:
	double m_dMinConfidence;
	double m_dMinSupport;
	int m_nSize;
	int m_nMinConfidence;
	int m_nMinSupport;

	CFP_TreeNode *m_pCFP_TreeRoot;

private:

	void insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount);
	void DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);
	void printLinkList(CFP_TreeNode *tRoot);
	void destroy(CFP_TreeNode *tRoot);

public:
	vector<ItemSupport>m_vecItemSupportSet;
	map<ItemType,CItemHeaderNode>m_mapItemHeaderList;

	void sortMapItemHeaderList();
	void addItem(ItemType tItem,int tCount);
	void eraseInfrequent1ItemSet();

	/***********************************************
	*
	*功能:对tTranSet进行计数支持度从大到小排序
	*
	***********************************************/
	void sortTransactionSet(CTransactionSet &tTranSet);

	CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount);
	CFP_Tree(double tMinCon,double tMinSup,int tSize);
	void insertFPTree(CTransaction &tTran,int id,int tCount);
	void printPath();
	void printItemHeaderList();
	bool isSinglePath(CFP_TreeNode *tRoot);
	void destroy();
	friend class CFP_Growth;
};

#endif

//FP_Tree.cpp
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved，but you can use this program.
 */
#include<algorithm>
#include"FP_Tree.h"

bool cmp(ItemSupport &a,ItemSupport &b)
{
	return a.m_nSupportCount<b.m_nSupportCount;
}

void CFP_Tree::sortMapItemHeaderList()
{
	vector<ItemSupport>tItemSupportSet;

	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
	{
		tItemSupportSet.push_back(ItemSupport(iter->first,iter->second.m_nSupportCount));
	}

	sort(tItemSupportSet.begin(),tItemSupportSet.end(),cmp);

	m_vecItemSupportSet.clear();
	for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin();iter!=tItemSupportSet.end();++iter)
	{
	//	cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")"<<endl;
		m_vecItemSupportSet.push_back(*iter);
	}
}

CFP_Tree::CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount)
{
	this->m_mapItemHeaderList.clear();
	this->m_vecItemSupportSet.clear();
	this->m_dMinConfidence=tMinCon;
	this->m_dMinSupport =tMinSup;
	this->m_nSize=tTranSet.getSize();
    this->m_nMinConfidence = (this->m_dMinConfidence)*(this->m_nSize);
	this->m_nMinSupport=(this->m_dMinSupport)*(this->m_nSize);

	this->m_pCFP_TreeRoot=new CFP_TreeNode(tCount);

	for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter)
	{
		for(vector<string>::iterator strIter=iter->getVecItem().begin();strIter!=iter->getVecItem().end();++strIter)
		{
			this->addItem(*strIter,1);
		}
	}
	this->sortTransactionSet(tTranSet);

	//cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl;

	this->eraseInfrequent1ItemSet();
	this->sortMapItemHeaderList();

	//cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl;
}

CFP_Tree::CFP_Tree(double tMinCon,double tMinSup,int tSize)
{
	this->m_mapItemHeaderList.clear();
	this->m_vecItemSupportSet.clear();
	this->m_dMinConfidence=tMinCon;
	this->m_dMinSupport =tMinSup;
	this->m_nSize=tSize;
    this->m_nMinConfidence = (this->m_dMinConfidence)*tSize;
	this->m_nMinSupport=(this->m_dMinSupport)*tSize;

	this->m_pCFP_TreeRoot=new CFP_TreeNode();
}

void CFP_Tree::addItem(ItemType tItem,int tCount)
{
	map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.find(tItem);
	if(iter!=m_mapItemHeaderList.end())
	{
	//	cout<<"&&&&&&&&&&&"<<endl;
		iter->second.m_nSupportCount +=tCount;
	}
	else
	{
		//cout<<"**********"<<endl;
	    CItemHeaderNode p(tCount);
	//	cout<<"**********p.m_nSupportCount"<<p.m_nSupportCount<<endl;
		pair<ItemType,CItemHeaderNode> node=make_pair(tItem,p);
		m_mapItemHeaderList.insert(node);
	}
}

void CFP_Tree::eraseInfrequent1ItemSet()
{
	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();)
	{
		if(iter->second.m_nSupportCount<this->m_nMinSupport)
		{
			m_mapItemHeaderList.erase(iter);
		}
		else
		{
			++iter;
		}
	}
}

class CTransactionItemSort
{
public:
  static bool cmp(ItemType a,ItemType b);
  static CFP_Tree *g_pRoot;
//std::sort要求函数对象，或是静态/全局函数指针
//非静态成员函数指针不能直接传递给std::sort

};

CFP_Tree *CTransactionItemSort::g_pRoot=NULL;

bool CTransactionItemSort::cmp(ItemType a,ItemType b)
{
	int aCount,bCount;
	map<ItemType,CItemHeaderNode>::iterator iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(a);
	if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end())
	{
		aCount=iter->second.m_nSupportCount;
	}
	else
	{
		aCount=-1;
	}

	iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(b);
	if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end())
	{
		bCount=iter->second.m_nSupportCount;
	}
	else
	{
		bCount=-1;
	}
	return aCount>bCount;
}

void CFP_Tree::sortTransactionSet(CTransactionSet &tTranSet)
{
	CTransactionItemSort::g_pRoot=this;
	for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter)
	{
		sort(iter->getVecItem().begin(),iter->getVecItem().end(),CTransactionItemSort::cmp);//cmp调用出错
	}
}

void CFP_Tree::insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount)
{
	if(id>=tTran.getVecItem().size()||this->m_mapItemHeaderList.find(tTran.getVecItem()[id])==this->m_mapItemHeaderList.end())
		return ;

	CFP_TreeNode *pChild;
	//cout<<"**********this->m_nChildSize="<<this->m_nChildSize<<endl;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		 pChild=tRoot->m_pChildNode[i];
		if(pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id])
		{
		//	cout<<"pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id]"<<endl;
			pChild->m_nSupportCount+=tCount;
			this->insertFPTree(pChild,tTran,id+1,tCount);
			return ;
		}
	}

	//cout<<"&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl;
	ItemType item=tTran.getVecItem()[id];
	pChild=new CFP_TreeNode(item,tRoot, this->m_mapItemHeaderList[item].m_pFPFirst,tCount);
	this->m_mapItemHeaderList[item].m_pFPFirst=pChild;
	tRoot->m_pChildNode[tRoot->m_nChildSize]=pChild;
	++tRoot->m_nChildSize;
	insertFPTree(pChild,tTran,id+1,tCount);

	//cout<<"!!!!!!!!!!!!this->m_nChildSize="<<this->m_nChildSize<<endl;
}

void CFP_Tree::insertFPTree(CTransaction &tTran,int id,int tCount)
{
	this->insertFPTree(this->m_pCFP_TreeRoot,tTran,id,tCount);
}

void CFP_Tree::DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
	if(tRoot->m_nChildSize==0)
	{
		for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin(); iter!=tItemSupportSet.end();++iter)
			cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")";
		cout<<endl;
		return ;
	}
	//cout<<"&&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl;

	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		CFP_TreeNode *pChild= tRoot->m_pChildNode[i];
		tItemSupportSet.push_back(ItemSupport(pChild->m_ITItemName,pChild->m_nSupportCount));
		DFSPrintPath(pChild,tItemSupportSet);
		tItemSupportSet.pop_back();
	}
}

void CFP_Tree::printPath()
{
	vector<ItemSupport>tItemSupportSet;
	cout<<"打印FP_Tree树:"<<endl;
	this->DFSPrintPath(m_pCFP_TreeRoot,tItemSupportSet);
}

void CFP_Tree::printLinkList(CFP_TreeNode *tRoot)
{
	cout<<"--->("<<tRoot->m_ITItemName<<","<<tRoot->m_nSupportCount<<")";
	if(tRoot->m_pLinkedNode!=NULL)
	  printLinkList(tRoot->m_pLinkedNode);
}

void CFP_Tree::printItemHeaderList()
{
	cout<<"打印顶点表中每个单链表:"<<endl;
	for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
	{
		if(iter->second.m_pFPFirst!=NULL)
		{
			cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
			printLinkList(iter->second.m_pFPFirst);
			cout<<endl;
		}
	}
}

void CFP_Tree::destroy(CFP_TreeNode *tRoot)
{
	CFP_TreeNode *pChild;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
	    pChild=tRoot->m_pChildNode[i];
		if(pChild!=NULL)
		{
			destroy(pChild);
		}
	}
	tRoot->m_pFatherNode=NULL;
	tRoot->m_pLinkedNode=NULL;
	for(int i=0;i<tRoot->m_nChildSize;++i)
	{
		if(tRoot->m_pChildNode[i]!=NULL)
		{
		   delete tRoot->m_pChildNode[i];
	    	tRoot->m_pChildNode[i]=NULL;
		}
	}
}

void CFP_Tree::destroy()
{
	this->destroy(this->m_pCFP_TreeRoot);
}

bool CFP_Tree::isSinglePath(CFP_TreeNode *tRoot)
{
	if(0==tRoot->m_nChildSize)
		return true;
	else if(tRoot->m_nChildSize>1)
		return false;
	return isSinglePath(tRoot->m_pChildNode[0]);
}

//FP_Crowth.h
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved，but you can use this program.
 */
#ifndef FP_GROWTH_H
#define FP_GROWTH_H

#include"Transaction.h"
#include"TransactionSet.h"
#include"FP_Tree.h"

class CFP_Growth
{
private:
	CFP_Tree *m_pCFPTConditionTree;

private:
	void initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);
	void printOneFreSet(vector<ItemSupport> &tItemSupportSet);
	void findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);

public:

	CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);
	void printPath();
	void printItemHeaderList();
};

#endif

//FP_Crowth.cpp
/**
 * Created by xujin on 2014/12/4.
   All Rights Reserved，but you can use this program.
 */
#include"FP_Crowth.h"
#include<algorithm>
using namespace std;

void CFP_Growth::initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet)
{
	for(vector<ItemSupport>::iterator iter=tCFPTTree->m_vecItemSupportSet.begin();iter!=tCFPTTree->m_vecItemSupportSet.end();++iter)
	{
		map<ItemType,CItemHeaderNode>::iterator iterMap=tCFPTTree->m_mapItemHeaderList.find(iter->m_ITItemName);

		//创建条件FP_growth树
		CFP_Tree * pCFPTConTree =new CFP_Tree(tCFPTTree->m_dMinConfidence,tCFPTTree->m_dMinSupport,tCFPTTree->m_nSize);
		for(CFP_TreeNode *next=iterMap->second.m_pFPFirst; next!=NULL; next=next->m_pLinkedNode)
		{
			CTransaction tran;
			CFP_TreeNode *fa=next->m_pFatherNode;
			int count=next->m_nSupportCount;
			while(fa!=NULL&&!fa->m_ITItemName.empty())
			{
				tran.addItem(fa->m_ITItemName);
				pCFPTConTree->addItem(fa->m_ITItemName,count);
				fa=fa->m_pFatherNode;
			}
			CTransaction reve;
			for(vector<string>::reverse_iterator iter=tran.getVecItem().rbegin();iter!=tran.getVecItem().rend();++iter)
			{
				reve.addItem(*iter);
			}
			pCFPTConTree->insertFPTree(reve,0,count);
		}
		pCFPTConTree->sortMapItemHeaderList();

		tItemSupportSet.push_back(ItemSupport(iterMap->first,iterMap->second.m_nSupportCount));
		new CFP_Growth(pCFPTConTree,tItemSupportSet);
		tItemSupportSet.pop_back();
	}
}

CFP_Growth::CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport> &tItemSupportSet)
{
	this->m_pCFPTConditionTree=tCFPTTree;

	if(0==tCFPTTree->m_pCFP_TreeRoot->m_nChildSize)
	{
		this->printOneFreSet(tItemSupportSet);
		return ;
	}
	else if(tCFPTTree->isSinglePath(tCFPTTree->m_pCFP_TreeRoot))
	{
		findCombine(tCFPTTree->m_pCFP_TreeRoot->m_pChildNode[0],tItemSupportSet);
		return ;
	}
	else
	{
		initCFP_Growth(tCFPTTree,tItemSupportSet);
	}
}

void CFP_Growth::findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
	if(tRoot==NULL)
	{
		printOneFreSet(tItemSupportSet);
		return ;
	}
	findCombine(tRoot->m_pChildNode[0],tItemSupportSet);

	tItemSupportSet.push_back(ItemSupport(tRoot->m_ITItemName,tRoot->m_nSupportCount));
	findCombine(tRoot->m_pChildNode[0],tItemSupportSet);
	tItemSupportSet.pop_back();

}

void CFP_Growth::printOneFreSet(vector<ItemSupport> &tItemSupportSet)
{
	if(1==tItemSupportSet.size())
		return ;

	int count=m_pCFPTConditionTree->m_nSize*10;

	for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter)
	{
		if(count>iter->m_nSupportCount)
			count=iter->m_nSupportCount;
	}
	if(count<m_pCFPTConditionTree->m_nMinSupport)
	   return ;

	cout<<"{ ";
	for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter)
	{
		cout<<iter->m_ITItemName<<" ";
	}
	cout<<count<<" }";
	cout<<endl;
}

void CFP_Growth::printPath()
{
	vector<ItemSupport>tItemSupportSet;
	cout<<"打印FP_Tree树:"<<endl;
	this->m_pCFPTConditionTree->DFSPrintPath(this->m_pCFPTConditionTree->m_pCFP_TreeRoot,tItemSupportSet);
}

void CFP_Growth::printItemHeaderList()
{
	cout<<"打印顶点表中每个单链表:"<<endl;
	for(map<ItemType,CItemHeaderNode>::iterator iter=this->m_pCFPTConditionTree->m_mapItemHeaderList.begin();iter!=this->m_pCFPTConditionTree->m_mapItemHeaderList.end();++iter)
	{
		if(iter->second.m_pFPFirst!=NULL)
		{
			cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
			this->m_pCFPTConditionTree->printLinkList(iter->second.m_pFPFirst);
			cout<<endl;
		}
	}
}

FP-growth算法比Apriori算法快一个数量级，在空间复杂度方面也比Apriori也有数量级级别的优化。但是对于海量数据，FP-growth的时空复杂度仍然很高，可以采用的改进方法包括数据库划分，数据采样等等。

由于时间有限，在写博文的过程中参考过一些文献，在此表示感谢；同时鉴于水平原因，你难免有不足之处，欢迎斧正！

时间： 2024-10-05 21:48:07

FP_Growth算法原理及实现

FP_Growth算法原理及实现的相关文章

Adaboost算法原理分析和实例+代码（简明易懂）

FP Tree算法原理总结

分布式memcached学习（四）—— 一致性hash算法原理

POJ1523(求连用分量数目,tarjan算法原理理解)

Kmeans聚类算法原理与实现

【转】两种非对称算法原理：RSA和DH

Canny边缘检测算法原理及其VC实现详解(一)

排序算法原理及实现

Bagging与随机森林算法原理小结