***********************************************声明******************************************************
原创作品,出自 “晓风残月xj” 博客,欢迎转载,转载时请务必注明出处(http://blog.csdn.net/xiaofengcanyuexj)。
由于各种原因,可能存在诸多不足,欢迎斧正!
*********************************************************************************************************
前面提到关联规则寻找频繁项集的Apriori算法,Apriori算法是挖掘布尔型关联规则频繁项集的最为经典、最为基本的算法,但是该算法需要不断寻找候选集,然后剪枝即去掉包含非频繁子集的候选集 ,效率不是很高,时间复杂度由暴力枚举所有子集的指数级别O(n^2) 降为多项式级别,多项式具体系数是底层实现情况而定
。Apriori算法的主要瓶颈在于不断寻找候选项集,可不可以找到一种不用频繁寻找候选项集的算法呢?而且当待挖掘的数据很大进而需要存储在数据库中时,Apriori算法还有一个无可回避的问题就是每次都要扫描数据库,涉及大量I/O操作,比较耗时(当然可以不用数据库)。
FP_Gwoth算法是一种不生成候选集从而寻找频繁项集的算法,主要基于树结构:包含一个一棵FP_Tree和一个项头表,每个项通过一个结点链指向它在树中出现的位置。基本结构如下所示。需要注意的是项头表需要按照支持度递减排序,在FP_Tree(有后缀的也成条件FP_Tree)中高支持度的节点只能是低支持度节点的祖先节点。这样一来可以保证尽可能的共用祖先节点,更重要的是保证正确性。
procedure FP_Growth(FP_Tree, α)
if FP_Tree 只含单个路径P then{
【1】
for 路径P中结点的每个组合(记作β)
【2】
产生模式βUα,其支持度MinSupport =β 中结点的最小支持度;
}
else{
for each αi 在FP_Tree的项头表(按照支持度由低到高顺序进行扫描){ 【3】
产生一个模式β= αiUβ,其支持度MinSupport=αi.MinSupport;
构造β的条件模式基,然后构造β的条件FP_Treeβ;
【4】
if FP_Tree不为空 then
调用 FP_Growth (FP_Treeβ, β);
}
}
【1】 FP_Tree 只含单个路径P,即只有一条分支且分支不能分叉,如果分叉可能隐含了分支合并问题,可能导致在为合并之前误删为不满足最小支持度;
【2】 若分支上有n个属性值,则总共有2^n组合,可以每个属性值取或不取两种情况递归下去;
【3】当前条件FP_Tree的的项头表,用尾插法建立单链表;
【4】
以当前项头表的αi沿着条件FP_Tree的每条分支向上找出所有条件模式,然后建立后缀模式β的条件FP_Treeβ
源代码:
在此声明,以下代码并不是系本人原创,如需使用,必须声明,谢谢!
//FP_Tree.h /** * Created by xujin on 2014/12/4. All Rights Reserved,but you can use this program. */ #ifndef FP_TREE_H #define FP_TREE_H #include"Transaction.h" #include"TransactionSet.h" #include<map> using namespace std; const int MAXN_CHILD=20; typedef string ItemType; struct ItemSupport { ItemType m_ITItemName; int m_nSupportCount; ItemSupport(ItemType tItem,int tSup) { m_ITItemName=tItem; m_nSupportCount=tSup; } }; struct CFP_TreeNode { int m_nSupportCount; int m_nChildSize; ItemType m_ITItemName; CFP_TreeNode *m_pFatherNode; CFP_TreeNode *m_pLinkedNode; CFP_TreeNode *m_pChildNode[MAXN_CHILD]; CFP_TreeNode() { m_ITItemName.clear(); m_nChildSize=0; m_nSupportCount =0; m_pFatherNode=NULL; m_pLinkedNode=NULL; for(int i=0;i<MAXN_CHILD;++i) m_pChildNode[i]=NULL; } CFP_TreeNode(int tCount) { m_ITItemName.clear(); m_nChildSize=0; m_nSupportCount =tCount; m_pFatherNode=NULL; m_pLinkedNode=NULL; for(int i=0;i<MAXN_CHILD;++i) m_pChildNode[i]=NULL; } CFP_TreeNode(ItemType tITtem,CFP_TreeNode *tFa,CFP_TreeNode *tLinked,int tCount) { m_ITItemName=tITtem; m_nChildSize=0; m_nSupportCount=tCount; m_pFatherNode =tFa; m_pLinkedNode=tLinked; for(int i=0;i<MAXN_CHILD;++i) m_pChildNode[i]=NULL; } }; struct CItemHeaderNode { int m_nSupportCount; CFP_TreeNode *m_pFPFirst; CItemHeaderNode() { m_nSupportCount =0 ; m_pFPFirst = NULL; } CItemHeaderNode(int tCount) { m_nSupportCount =tCount ; m_pFPFirst = NULL; } }; class CFP_Tree { private: double m_dMinConfidence; double m_dMinSupport; int m_nSize; int m_nMinConfidence; int m_nMinSupport; CFP_TreeNode *m_pCFP_TreeRoot; private: void insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount); void DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet); void printLinkList(CFP_TreeNode *tRoot); void destroy(CFP_TreeNode *tRoot); public: vector<ItemSupport>m_vecItemSupportSet; map<ItemType,CItemHeaderNode>m_mapItemHeaderList; void sortMapItemHeaderList(); void addItem(ItemType tItem,int tCount); void eraseInfrequent1ItemSet(); /*********************************************** * *功能:对tTranSet进行计数支持度从大到小排序 * ***********************************************/ void sortTransactionSet(CTransactionSet &tTranSet); CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount); CFP_Tree(double tMinCon,double tMinSup,int tSize); void insertFPTree(CTransaction &tTran,int id,int tCount); void printPath(); void printItemHeaderList(); bool isSinglePath(CFP_TreeNode *tRoot); void destroy(); friend class CFP_Growth; }; #endif
//FP_Tree.cpp /** * Created by xujin on 2014/12/4. All Rights Reserved,but you can use this program. */ #include<algorithm> #include"FP_Tree.h" bool cmp(ItemSupport &a,ItemSupport &b) { return a.m_nSupportCount<b.m_nSupportCount; } void CFP_Tree::sortMapItemHeaderList() { vector<ItemSupport>tItemSupportSet; for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter) { tItemSupportSet.push_back(ItemSupport(iter->first,iter->second.m_nSupportCount)); } sort(tItemSupportSet.begin(),tItemSupportSet.end(),cmp); m_vecItemSupportSet.clear(); for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin();iter!=tItemSupportSet.end();++iter) { // cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")"<<endl; m_vecItemSupportSet.push_back(*iter); } } CFP_Tree::CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount) { this->m_mapItemHeaderList.clear(); this->m_vecItemSupportSet.clear(); this->m_dMinConfidence=tMinCon; this->m_dMinSupport =tMinSup; this->m_nSize=tTranSet.getSize(); this->m_nMinConfidence = (this->m_dMinConfidence)*(this->m_nSize); this->m_nMinSupport=(this->m_dMinSupport)*(this->m_nSize); this->m_pCFP_TreeRoot=new CFP_TreeNode(tCount); for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter) { for(vector<string>::iterator strIter=iter->getVecItem().begin();strIter!=iter->getVecItem().end();++strIter) { this->addItem(*strIter,1); } } this->sortTransactionSet(tTranSet); //cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl; this->eraseInfrequent1ItemSet(); this->sortMapItemHeaderList(); //cout<<"**********CFP_Tree::m_mapItemHeaderList.size()="<<CFP_Tree::m_mapItemHeaderList.size()<<endl; } CFP_Tree::CFP_Tree(double tMinCon,double tMinSup,int tSize) { this->m_mapItemHeaderList.clear(); this->m_vecItemSupportSet.clear(); this->m_dMinConfidence=tMinCon; this->m_dMinSupport =tMinSup; this->m_nSize=tSize; this->m_nMinConfidence = (this->m_dMinConfidence)*tSize; this->m_nMinSupport=(this->m_dMinSupport)*tSize; this->m_pCFP_TreeRoot=new CFP_TreeNode(); } void CFP_Tree::addItem(ItemType tItem,int tCount) { map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.find(tItem); if(iter!=m_mapItemHeaderList.end()) { // cout<<"&&&&&&&&&&&"<<endl; iter->second.m_nSupportCount +=tCount; } else { //cout<<"**********"<<endl; CItemHeaderNode p(tCount); // cout<<"**********p.m_nSupportCount"<<p.m_nSupportCount<<endl; pair<ItemType,CItemHeaderNode> node=make_pair(tItem,p); m_mapItemHeaderList.insert(node); } } void CFP_Tree::eraseInfrequent1ItemSet() { for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();) { if(iter->second.m_nSupportCount<this->m_nMinSupport) { m_mapItemHeaderList.erase(iter); } else { ++iter; } } } class CTransactionItemSort { public: static bool cmp(ItemType a,ItemType b); static CFP_Tree *g_pRoot; //std::sort要求函数对象,或是静态/全局函数指针 //非静态成员函数指针不能直接传递给std::sort }; CFP_Tree *CTransactionItemSort::g_pRoot=NULL; bool CTransactionItemSort::cmp(ItemType a,ItemType b) { int aCount,bCount; map<ItemType,CItemHeaderNode>::iterator iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(a); if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end()) { aCount=iter->second.m_nSupportCount; } else { aCount=-1; } iter=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.find(b); if(iter!=CTransactionItemSort::g_pRoot->m_mapItemHeaderList.end()) { bCount=iter->second.m_nSupportCount; } else { bCount=-1; } return aCount>bCount; } void CFP_Tree::sortTransactionSet(CTransactionSet &tTranSet) { CTransactionItemSort::g_pRoot=this; for(vector<CTransaction>::iterator iter=tTranSet.getVeCTransaction().begin();iter!=tTranSet.getVeCTransaction().end();++iter) { sort(iter->getVecItem().begin(),iter->getVecItem().end(),CTransactionItemSort::cmp);//cmp调用出错 } } void CFP_Tree::insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount) { if(id>=tTran.getVecItem().size()||this->m_mapItemHeaderList.find(tTran.getVecItem()[id])==this->m_mapItemHeaderList.end()) return ; CFP_TreeNode *pChild; //cout<<"**********this->m_nChildSize="<<this->m_nChildSize<<endl; for(int i=0;i<tRoot->m_nChildSize;++i) { pChild=tRoot->m_pChildNode[i]; if(pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id]) { // cout<<"pChild!=NULL&&pChild->m_ITItemName==tTran.getVecItem()[id]"<<endl; pChild->m_nSupportCount+=tCount; this->insertFPTree(pChild,tTran,id+1,tCount); return ; } } //cout<<"&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl; ItemType item=tTran.getVecItem()[id]; pChild=new CFP_TreeNode(item,tRoot, this->m_mapItemHeaderList[item].m_pFPFirst,tCount); this->m_mapItemHeaderList[item].m_pFPFirst=pChild; tRoot->m_pChildNode[tRoot->m_nChildSize]=pChild; ++tRoot->m_nChildSize; insertFPTree(pChild,tTran,id+1,tCount); //cout<<"!!!!!!!!!!!!this->m_nChildSize="<<this->m_nChildSize<<endl; } void CFP_Tree::insertFPTree(CTransaction &tTran,int id,int tCount) { this->insertFPTree(this->m_pCFP_TreeRoot,tTran,id,tCount); } void CFP_Tree::DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet) { if(tRoot->m_nChildSize==0) { for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin(); iter!=tItemSupportSet.end();++iter) cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")"; cout<<endl; return ; } //cout<<"&&&&&&&&this->m_nChildSize="<<this->m_nChildSize<<endl; for(int i=0;i<tRoot->m_nChildSize;++i) { CFP_TreeNode *pChild= tRoot->m_pChildNode[i]; tItemSupportSet.push_back(ItemSupport(pChild->m_ITItemName,pChild->m_nSupportCount)); DFSPrintPath(pChild,tItemSupportSet); tItemSupportSet.pop_back(); } } void CFP_Tree::printPath() { vector<ItemSupport>tItemSupportSet; cout<<"打印FP_Tree树:"<<endl; this->DFSPrintPath(m_pCFP_TreeRoot,tItemSupportSet); } void CFP_Tree::printLinkList(CFP_TreeNode *tRoot) { cout<<"--->("<<tRoot->m_ITItemName<<","<<tRoot->m_nSupportCount<<")"; if(tRoot->m_pLinkedNode!=NULL) printLinkList(tRoot->m_pLinkedNode); } void CFP_Tree::printItemHeaderList() { cout<<"打印顶点表中每个单链表:"<<endl; for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter) { if(iter->second.m_pFPFirst!=NULL) { cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :"; printLinkList(iter->second.m_pFPFirst); cout<<endl; } } } void CFP_Tree::destroy(CFP_TreeNode *tRoot) { CFP_TreeNode *pChild; for(int i=0;i<tRoot->m_nChildSize;++i) { pChild=tRoot->m_pChildNode[i]; if(pChild!=NULL) { destroy(pChild); } } tRoot->m_pFatherNode=NULL; tRoot->m_pLinkedNode=NULL; for(int i=0;i<tRoot->m_nChildSize;++i) { if(tRoot->m_pChildNode[i]!=NULL) { delete tRoot->m_pChildNode[i]; tRoot->m_pChildNode[i]=NULL; } } } void CFP_Tree::destroy() { this->destroy(this->m_pCFP_TreeRoot); } bool CFP_Tree::isSinglePath(CFP_TreeNode *tRoot) { if(0==tRoot->m_nChildSize) return true; else if(tRoot->m_nChildSize>1) return false; return isSinglePath(tRoot->m_pChildNode[0]); }
//FP_Crowth.h /** * Created by xujin on 2014/12/4. All Rights Reserved,but you can use this program. */ #ifndef FP_GROWTH_H #define FP_GROWTH_H #include"Transaction.h" #include"TransactionSet.h" #include"FP_Tree.h" class CFP_Growth { private: CFP_Tree *m_pCFPTConditionTree; private: void initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet); void printOneFreSet(vector<ItemSupport> &tItemSupportSet); void findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet); public: CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet); void printPath(); void printItemHeaderList(); }; #endif
//FP_Crowth.cpp /** * Created by xujin on 2014/12/4. All Rights Reserved,but you can use this program. */ #include"FP_Crowth.h" #include<algorithm> using namespace std; void CFP_Growth::initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet) { for(vector<ItemSupport>::iterator iter=tCFPTTree->m_vecItemSupportSet.begin();iter!=tCFPTTree->m_vecItemSupportSet.end();++iter) { map<ItemType,CItemHeaderNode>::iterator iterMap=tCFPTTree->m_mapItemHeaderList.find(iter->m_ITItemName); //创建条件FP_growth树 CFP_Tree * pCFPTConTree =new CFP_Tree(tCFPTTree->m_dMinConfidence,tCFPTTree->m_dMinSupport,tCFPTTree->m_nSize); for(CFP_TreeNode *next=iterMap->second.m_pFPFirst; next!=NULL; next=next->m_pLinkedNode) { CTransaction tran; CFP_TreeNode *fa=next->m_pFatherNode; int count=next->m_nSupportCount; while(fa!=NULL&&!fa->m_ITItemName.empty()) { tran.addItem(fa->m_ITItemName); pCFPTConTree->addItem(fa->m_ITItemName,count); fa=fa->m_pFatherNode; } CTransaction reve; for(vector<string>::reverse_iterator iter=tran.getVecItem().rbegin();iter!=tran.getVecItem().rend();++iter) { reve.addItem(*iter); } pCFPTConTree->insertFPTree(reve,0,count); } pCFPTConTree->sortMapItemHeaderList(); tItemSupportSet.push_back(ItemSupport(iterMap->first,iterMap->second.m_nSupportCount)); new CFP_Growth(pCFPTConTree,tItemSupportSet); tItemSupportSet.pop_back(); } } CFP_Growth::CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport> &tItemSupportSet) { this->m_pCFPTConditionTree=tCFPTTree; if(0==tCFPTTree->m_pCFP_TreeRoot->m_nChildSize) { this->printOneFreSet(tItemSupportSet); return ; } else if(tCFPTTree->isSinglePath(tCFPTTree->m_pCFP_TreeRoot)) { findCombine(tCFPTTree->m_pCFP_TreeRoot->m_pChildNode[0],tItemSupportSet); return ; } else { initCFP_Growth(tCFPTTree,tItemSupportSet); } } void CFP_Growth::findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet) { if(tRoot==NULL) { printOneFreSet(tItemSupportSet); return ; } findCombine(tRoot->m_pChildNode[0],tItemSupportSet); tItemSupportSet.push_back(ItemSupport(tRoot->m_ITItemName,tRoot->m_nSupportCount)); findCombine(tRoot->m_pChildNode[0],tItemSupportSet); tItemSupportSet.pop_back(); } void CFP_Growth::printOneFreSet(vector<ItemSupport> &tItemSupportSet) { if(1==tItemSupportSet.size()) return ; int count=m_pCFPTConditionTree->m_nSize*10; for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter) { if(count>iter->m_nSupportCount) count=iter->m_nSupportCount; } if(count<m_pCFPTConditionTree->m_nMinSupport) return ; cout<<"{ "; for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter) { cout<<iter->m_ITItemName<<" "; } cout<<count<<" }"; cout<<endl; } void CFP_Growth::printPath() { vector<ItemSupport>tItemSupportSet; cout<<"打印FP_Tree树:"<<endl; this->m_pCFPTConditionTree->DFSPrintPath(this->m_pCFPTConditionTree->m_pCFP_TreeRoot,tItemSupportSet); } void CFP_Growth::printItemHeaderList() { cout<<"打印顶点表中每个单链表:"<<endl; for(map<ItemType,CItemHeaderNode>::iterator iter=this->m_pCFPTConditionTree->m_mapItemHeaderList.begin();iter!=this->m_pCFPTConditionTree->m_mapItemHeaderList.end();++iter) { if(iter->second.m_pFPFirst!=NULL) { cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :"; this->m_pCFPTConditionTree->printLinkList(iter->second.m_pFPFirst); cout<<endl; } } }
FP-growth算法比Apriori算法快一个数量级,在空间复杂度方面也比Apriori也有数量级级别的优化。但是对于海量数据,FP-growth的时空复杂度仍然很高,可以采用的改进方法包括数据库划分,数据采样等等。
由于时间有限,在写博文的过程中参考过一些文献,在此表示感谢;同时鉴于水平原因,你难免有不足之处,欢迎斧正!