参考论文:
Srikant R, Agrawal R.Mining sequential patterns: Generalizations and performance improvements[M].Springer Berlin Heidelberg, 1996.
1. 参考论文描述:
Srikant R, Agrawal R. 提出的算法,在原有aporiori的基础上,引入了3个新的概念来定义频繁模式子序列:
1) 加入时间约束,使得原有的aporiori关注的连续变成了只要满足min_gap和max_gap约束的序列,都算是连续的。
2) 加入time_window_size。使得transaction有新的定义。只要在window_size内的item,都可以认为是在同一个itemset。
3) 加入分类标准。
本文的GSP算法实现步骤如下:
1) 候选集生成
a) 合并阶段: 2个subsequence s1和s2能够合并的标准是,去掉s1中的第一个item,去掉s2中的最后一个item,若此时s1和s2相同,则可以对s1和s2进行合并,即将s2中的最后一个item加入到s1中,其中最后一个item是否为合并在原来s1的最后一个itemset,还是自成一个新的itemset,取决于s2的最后一个item是否原来就是一个单独的itemset。
b) 剪枝阶段:不频繁子序列的超集也不频繁。
2) 候选集计算
a) 减少候选集检验是否满足的个数:在这,使用到了哈希树,即将序列中的第p个item映射在第p层上。然后按照候选集进行检查。
b) 检查语料库中是否有满足要求的特定子序列:此处用到了倒排记录表,而倒排记录表的ID可以是item,也可以是transaction-time。简要步骤就是,首先找到第一个itemset,记录时间,然后对于第二个itemset,在第一个itemset的时间上往下继续查找,当找到时,判断2个transaction_time的时间是否满足min_gap和max_gap,如果不满足,则在第二个itemset的基础上先往回找最近第一个itemset时间,如果此时仍不满足,则在第二个itemset的时间的基础上往后查找,如此forward和
backward的步骤。
3) 分类。
2. 算法实现细节:
大致流程:
读入数据à 建立倒排索引表(item to time)à 产生1-frequency itemsetà 产生2-frequency itemsetà …
源文件Ubuntu下编译: c++ gsp.cpp–o gsp
输入参数: ./gsp –i data.txt–t minSupport -min min_gap –max max_gap
例如: ./gsp –i data.txt–t 0.9 -min 2 -max 4
(1). 数据读入:
读入的数据类型为:
TransactionID itemNum item1 item2 item3…
TransactionID itemNum item1 item2 item3…
其中,其中itemNum为一个itemset的item个数,比如读入:
1 2 3 3
1 1 2
1 2 3 4
2 2 1 2
则此时的sequence分别为 <(3 3) (2) (3 4)> , <(1 2)>,此时根据TransactionID来判断是否在一个sequence中。
(2). 采用STL的vector和map来实现,其中主要的数据结构如下:
// An element in thedata sequence.
struct Itemset
{
std::vector<elemType> item; // store the single itemset‘s items
int timestamp; // record current itemsettimestamp
Itemset(){
timestamp = 0;
}
};
// data sequence.
struct Sequence{
int tid; // transation id
int num; // the number of itemset.
std::vector<Itemset> itemset; // store the itemsets.
Sequence() {
tid = 0;
num = 0;
}
};
(3) 建立倒排索引(inverted index)
函数:voidCreateInvertedIndex(int seqNum)
实现细节:vector<vector<map<elemType,int>>> elem2time, 用此来存储每个sequence中每个item到timestamp的映射。在计算支持度的时候将用到。
(4) 产生1-frequency itemset.
函数: std::map<elemType,int> GenerateOneSequence(int seqNum)
实现细节: 对给定数据扫描,用map来映射每个item,然后对然后累加每个item在某个sequence中出现的次数,计算是否满足min_support.
(5) 产生2-frequncy itemset.
函数:vector<Sequence>GenerateTwoSequence(vector<int> oneSeq)
实现细节: 对于1-frequency itemset, 分别由此得到<(1 2)> 、<(1) (2)>如此的itemset, 然后对得到的所有sequence进行剪枝,剪枝策略是通过计算他们在原来数据中的支持度。
(6) 产生3-more frequency itemset.
函数:vector<Sequence> JoinPhase(vector<Sequence> tmpCandidateSet)
实现细节:按照论文的说法,去掉s1中的第一个item和去掉s2中的最后一个item, 合并成新的sequence. 其中还包含判断加入s2的最后一个item是否自成一个itemset,或者合并在已有的itemset中。
函数:vector<Sequence>PrunePhase(std::vector<Sequence> duplicateSet)
实现细节:调用CountSupport函数,进行计算支持度,如果支持度大于阈值,则加入candidateSet。
函数:intCountSupport(Sequence curSequence)
实现细节:对每个sequence,根据已经建好的倒排记录表(数据集中的items映射到时间),首先找到每个itemset的时间,进行存储在vector中,然后进行dfs查询,判断是否满足min_gap和max_gap,如果满足则是一个满足的k-frequency sequence。
函数:boolDfs(vector<vector<int>>storeTime, int len, int cur, int id)
实现细节: 对已经得到的某个sequence每个itemset的对应时间,递归查找是否满足要求min_gap和max_gap,是的话使当前查询的sequence的支持度加1.
(7) 输出
函数:voidOutput(std::vector<Sequence> candidateSet, int id)
实现细节:输出满足的frequent sequence。
#include "structure.h" using namespace std; // data literals. vector<Sequence> literals; // inverted index vector<vector<map<elemType, int>>> elem2time; vector<Sequence> infrequencySeq; // argv[0]='data.txt', argv[1] = min_support // argv[2]=min_gap, argv[3] = max_gap int main(/*int argc, char *argv[]*/) { freopen("100.txt", "r", stdin); // minimum support minSupport = 0.9; // the number of sequence. seqNum = 0; // continues min_gap min_gap = 2; // continuous max_gap max_gap = 4; //Sequence seq; ReadData(seqNum); //count support min_support = seqNum * minSupport; // create inverted index CreateInvertedIndex(seqNum); // record start time. int tim1 = time(0); // Implementation of Gsp algorithm; Gsp(seqNum); // record the end time. int tim2 = time(0); OutputParameter(min_gap, max_gap, minSupport, seqNum, tim1, tim2); return 0; } /******************************************************************************* * @Reference: Mining Sequence Patterns: Generalizations * and Performance Improvements Srikant.R, Agrawal.R, VLDB 96; * @description: The main strategy by GSP algorithm here is: * 1. Candidate Generation: * 1) Join phase * 2) Prune phase. * 2. Counting Candidates: * 1) Reducing the number of candidates that need * to be checked. * a) Adding candidate sequences to the hash-tree * b) Finding the candidates contained in the * data-sequence * 2) Checking whether a data-sequence contains a specific * sequence * Contains test. * a) Forward phase * b) Backward phase * Here we simply using DFS to implement it. * 3. Taxonomies. * @CreateTime: 2014-12-7 10:00 * @LastModifiedTime: 2014-12-9 1:25 *******************************************************************************/ void Gsp(int seqNum) { freopen("output.txt", "w", stdout); map<elemType, int> cand; // Scan the database to find 1-sequence. //-----------------------------first scan----------------------------------- cand = GenerateOneSequence(seqNum); //--------------------------first scan end---------------------------------- vector<elemType> oneSeq; // Get 1-frequency sequence for(map<elemType, int>::iterator iter1 = cand.begin(); iter1 != cand.end(); iter1 ++) oneSeq.push_back(iter1->first); // Get 2-frequency sequence; vector<Sequence> candidateSet; candidateSet = GenerateTwoSequence(oneSeq); Output(candidateSet, 2); for(int k = 3;; k ++) { candidateSet = JoinPhase(candidateSet); if(candidateSet.size() == 0) { cout << "No further " << k-1 << "-frequency patterns." << endl; break; } else{ //candidateSet = PrunePhase(candidateSet); Output(candidateSet, k); } } } // First scan the database to get frequency is one itemset. map<elemType, int> GenerateOneSequence(int seqNum) { map<elemType, int> dup; map<elemType, int> cand; // count every item in the sequence. for(int k = 0; k < literals.size(); k ++) { map<elemType, int> item; for(int i = 0; i < literals[k].itemset.size(); i ++) { for(int j = 0; j < literals[k].itemset[i].item.size(); j ++) { item[literals[k].itemset[i].item[j]] ++; } } map<elemType, int>::iterator iter; for(iter = item.begin(); iter != item.end(); iter ++) { if(iter->second > 0) dup[iter->first] ++; } } // add up to become support. map<elemType, int>::iterator iter1, iter2; for (iter1 = dup.begin(); iter1 != dup.end(); iter1 ++) if(iter1->second >= min_support) { cand[iter1->first] = iter1->second; } cout << "min_support = " << minSupport * seqNum << endl; cout << "1-frequent candidate set" << endl; for(iter1 = cand.begin(); iter1 != cand.end(); iter1 ++) printf("<%d>: %d\n", iter1->first, iter1->second); return cand; } // Generate two frequency sequences. vector<Sequence> GenerateTwoSequence(vector<int> oneSeq) { vector<Sequence> candidateSet; // gather all the two-frequency sequence. for(int i = 0; i < oneSeq.size(); i ++) for(int j = 0; j < oneSeq.size(); j ++) { // <aa> <ab> Sequence duplicate; Itemset tmpItem1, tmpItem2; tmpItem1.item.push_back(oneSeq[i]); duplicate.itemset.push_back(tmpItem1); tmpItem2.item.push_back(oneSeq[j]); duplicate.itemset.push_back(tmpItem2); int cnt = CountSupport(duplicate); if(cnt >= min_support) candidateSet.push_back(duplicate); else infrequencySeq.push_back(duplicate); } // Generate <(ab)>. <(ac)> for(int i = 0; i < oneSeq.size() - 1; i ++) for(int j = i+1; j < oneSeq.size(); j ++) { Sequence duplicate; Itemset tmpItem; tmpItem.item.push_back(oneSeq[i]); tmpItem.item.push_back(oneSeq[j]); duplicate.itemset.push_back(tmpItem); int cnt = CountSupport(duplicate); duplicate.num = cnt; if(cnt >= min_support) candidateSet.push_back(duplicate); else infrequencySeq.push_back(duplicate); } //return PrunePhase(candidateSet); return candidateSet; } // generate k >= 3-frequency sequences vector<Sequence> JoinPhase(vector<Sequence> tmpCandidateSet) { vector<vector<elemType>> seq; vector<Sequence> candidateSet; for(int k = 0; k < tmpCandidateSet.size(); k ++) { vector<elemType> items; for(int i = 0; i < tmpCandidateSet[k].itemset.size(); i ++) for(int j = 0; j < tmpCandidateSet[k].itemset[i].item.size(); j ++) items.push_back(tmpCandidateSet[k].itemset[i].item[j]); seq.push_back(items); } for(int i = 0; i < seq.size(); i ++) for(int j = 0; j < seq.size(); j ++) { vector<elemType> s1; vector<elemType> s2; for(int p = 1; p < seq[i].size(); p ++) s1.push_back(seq[i][p]); for(int q = 0; q < seq[j].size(); q ++) s2.push_back(seq[j][q]); bool flag = false; for(int k = 0; k < s1.size(); k ++) if(s1[k] != s2[k]) flag = true; if(!flag){ Sequence tmpSeq; for(int p = 0; p < tmpCandidateSet[i].itemset.size(); p ++) { Itemset tmpItems; for(int q = 0; q < tmpCandidateSet[i].itemset[p].item.size(); q ++) tmpItems.item.push_back(tmpCandidateSet[i].itemset[p].item[q]); tmpSeq.itemset.push_back(tmpItems); } int tp = tmpCandidateSet[j].itemset.size()-1; int tq = tmpCandidateSet[j].itemset[tp].item.size()-1; int sp = tmpCandidateSet[i].itemset.size()-1; int sq = tmpCandidateSet[i].itemset[sp].item.size()-1; if(tmpCandidateSet[j].itemset[tp].item.size() > 1) { tmpSeq.itemset[sp].item.push_back(s2[s2.size()-1]); } else{ Itemset tmpItems; tmpItems.item.push_back(s2[s2.size()-1]); tmpSeq.itemset.push_back(tmpItems); } int cnt = CountSupport(tmpSeq); tmpSeq.num = cnt; if(cnt >= min_support) candidateSet.push_back(tmpSeq); else infrequencySeq.push_back(tmpSeq); } } return candidateSet; } // Prune Phase; // Strategy here, we will use forward phase, and backward phase. vector<Sequence> PrunePhase(std::vector<Sequence> duplicateSet) { vector<Sequence> candidateSet; for(int k = 0; k < duplicateSet.size(); k ++) { Sequence duplicate; for(int i = 0; i < duplicateSet[k].itemset.size(); i ++) { Itemset items; for(int j = 0; j < duplicateSet[k].itemset[i].item.size(); j ++) items.item.push_back(duplicateSet[k].itemset[i].item[j]); duplicate.itemset.push_back(items); } int cnt = CountSupport(duplicate); if(cnt >= min_support) { duplicate.num = cnt; candidateSet.push_back(duplicate); } } return candidateSet; } // Count candidate set's support int CountSupport(Sequence curSequence) { int cnt = 0; //cout << "CountSupport" << endl; vector<vector<int>> seq; for(int i = 0; i < curSequence.itemset.size(); i ++) { vector<int> subSeq; for(int j = 0; j < curSequence.itemset[i].item.size(); j ++) subSeq.push_back(curSequence.itemset[i].item[j]); seq.push_back(subSeq); } for(int k = 0; k < literals.size(); k ++) { vector<vector<int>> storeTime; for(int i = 0; i < seq.size(); i ++) { int matchNum; int t = -1; vector<int> localTime; for(int p = 0; p < elem2time[k].size(); p ++) { matchNum = 0; for(int j = 0; j < seq[i].size(); j ++) if(elem2time[k][p][seq[i][j]] > 0) { t = elem2time[k][p][seq[i][j]]; matchNum ++; } if(matchNum == seq[i].size()) localTime.push_back(t); } if(localTime.size() > 0) storeTime.push_back(localTime); } if(storeTime.size() != seq.size()) continue; for(int i = 0; i < storeTime[0].size(); i ++) { bool flag = false; flag = Dfs(storeTime, storeTime.size(), storeTime[0][i], 1); if (flag) { cnt ++; break; } } } return cnt; } // Dfs to find a sequence contain the min_gap and max_gap conditions bool Dfs(vector<vector<int>>storeTime, int len, int cur, int id) { if(id == len) return true; for(int i = 0; i < storeTime[id].size(); i ++){ if(storeTime[id][i] - cur >= min_gap && storeTime[id][i] - cur <= max_gap) { return Dfs(storeTime, storeTime.size(), storeTime[id][i], id + 1); } } return false; } // Output k-th candidateSet. void Output(std::vector<Sequence> candidateSet, int id) { int len = candidateSet.size(); if(len != 0) cout << id << "-frequency candidate set" << endl; for(int k = 0; k < len; k ++) { cout << "<"; for(int i = 0; i < candidateSet[k].itemset.size(); i ++) { cout << "("; for(int j = 0; j < candidateSet[k].itemset[i].item.size(); j ++) { cout << candidateSet[k].itemset[i].item[j] << " "; } cout << ")"; } cout << ">: "; cout << candidateSet[k].num << endl; } cout << endl; } // Read data from file void ReadData(/*Sequence *seq, */int &seqNum) { int id, num, tmp, t; int cnt = 1; map<int, int> transactionID; while(cin >> id) { Sequence seq; if(transactionID[id] == 0) { Itemset items; cin >> num; t = 0; for(int i = 0; i < num; i ++) { cin >> tmp; items.item.push_back(tmp); } items.timestamp = t; t ++; seq.itemset.push_back(items); seq.tid = id; literals.push_back(seq); transactionID[id] = cnt ++; } else{ Itemset items; cin >> num; for(int i = 0; i < num; i ++) { cin >> tmp; items.item.push_back(tmp); } items.timestamp = t; t ++; literals[transactionID[id]-1].itemset.push_back(items); } } seqNum = literals.size(); } inline int min(int a, int b) { return a > b ? b : a; } // Create Inverted Index void CreateInvertedIndex(int seqNum) { for(int k = 0; k < seqNum; k ++) { vector<map<elemType, int>> invertedIndex; for(int i = 0; i < literals[k].itemset.size(); i ++) { map<elemType, int> tmp; for(int j = 0; j < literals[k].itemset[i].item.size(); j ++) tmp[literals[k].itemset[i].item[j]] = i+1; invertedIndex.push_back(tmp); } elem2time.push_back(invertedIndex); } } // output parameters void OutputParameter(int min_gap, int max_gap, double min_support, int seqNum, int tim1, int tim2) { cout << "---------------------------------" << endl; cout << "\nmin_gap = " << min_gap << endl; cout << "max_gap = " << max_gap << endl; cout << "min_support = " << min_support << endl; cout << "the number of sequence = " << seqNum << endl; // output time cout << "\nbegin time: " << tim1 << '\n'; cout << "end time : " << tim2 << '\n'; cout << "The execution time is:\n" << tim2-tim1; cout << "\n\nEnd the program" << endl; cout << "---------------------------------" << endl; }