基于用户的协同过滤算法-参考《推荐系统实践》一书,作者:项亮
1 import random 2 import math 3 class UserBasedCF: 4 def __init__(self,datafile = None): 5 self.datafile = datafile 6 self.readData() 7 self.splitData(3,47) 8 def readData(self,datafile = None): 9 """ 10 read the data from the data file which is a data set 11 """ 12 self.datafile = datafile or self.datafile 13 self.data = [] 14 for line in open(self.datafile): 15 userid,itemid,record,_ = line.split() 16 self.data.append((userid,itemid,int(record))) 17 def splitData(self,k,seed,data=None,M = 8): 18 """ 19 split the data set 20 testdata is a test data set 21 traindata is a train set 22 test data set / train data set is 1:M-1 23 """ 24 self.testdata = {} 25 self.traindata = {} 26 data = data or self.data 27 random.seed(seed) 28 for user,item, record in self.data: 29 if random.randint(0,M) == k: 30 self.testdata.setdefault(user,{}) 31 self.testdata[user][item] = record 32 else: 33 self.traindata.setdefault(user,{}) 34 self.traindata[user][item] = record 35 def userSimilarity(self,train = None): 36 """ 37 One method of getting user similarity matrix 38 """ 39 train = train or self.traindata 40 self.userSim = dict() 41 for u in train.keys(): 42 for v in train.keys(): 43 if u == v: 44 continue 45 self.userSim.setdefault(u,{}) 46 self.userSim[u][v] = len(set(train[u].keys()) & set(train[v].keys())) 47 self.userSim[u][v] /=math.sqrt(len(train[u]) * len(train[v]) *1.0) 48 def userSimilarityBest(self,train = None): 49 """ 50 the other method of getting user similarity which is better than above 51 you can get the method on page 46 52 In this experiment,we use this method 53 """ 54 train = train or self.traindata 55 self.userSimBest = dict() 56 item_users = dict() 57 for u,item in train.items(): 58 for i in item.keys(): 59 item_users.setdefault(i,set()) 60 item_users[i].add(u) 61 user_item_count = dict() 62 count = dict() 63 for item,users in item_users.items(): 64 for u in users: 65 user_item_count.setdefault(u,0) 66 user_item_count[u] += 1 67 for v in users: 68 if u == v:continue 69 count.setdefault(u,{}) 70 count[u].setdefault(v,0) 71 count[u][v] += 1 72 for u ,related_users in count.items(): 73 self.userSimBest.setdefault(u,dict()) 74 for v, cuv in related_users.items(): 75 self.userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0) 76 77 def recommend(self,user,train = None,k = 8,nitem = 40): 78 train = train or self.traindata 79 rank = dict() 80 interacted_items = train.get(user,{}) 81 for v ,wuv in sorted(self.userSimBest[user].items(),key = lambda x : x[1],reverse = True)[0:k]: 82 for i , rvi in train[v].items(): 83 if i in interacted_items: 84 continue 85 rank.setdefault(i,0) 86 rank[i] += wuv 87 return dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem]) 88 def recallAndPrecision(self,train = None,test = None,k = 8,nitem = 10): 89 """ 90 Get the recall and precision, the method you want to know is listed 91 in the page 43 92 """ 93 train = train or self.traindata 94 test = test or self.testdata 95 hit = 0 96 recall = 0 97 precision = 0 98 for user in train.keys(): 99 tu = test.get(user,{}) 100 rank = self.recommend(user, train = train,k = k,nitem = nitem) 101 for item,_ in rank.items(): 102 if item in tu: 103 hit += 1 104 recall += len(tu) 105 precision += nitem 106 return (hit / (recall * 1.0),hit / (precision * 1.0)) 107 def coverage(self,train = None,test = None,k = 8,nitem = 10): 108 train = train or self.traindata 109 test = test or self.testdata 110 recommend_items = set() 111 all_items = set() 112 for user in train.keys(): 113 for item in train[user].keys(): 114 all_items.add(item) 115 rank = self.recommend(user, train, k = k, nitem = nitem) 116 for item,_ in rank.items(): 117 recommend_items.add(item) 118 return len(recommend_items) / (len(all_items) * 1.0) 119 def popularity(self,train = None,test = None,k = 8,nitem = 10): 120 """ 121 Get the popularity 122 the algorithm on page 44 123 """ 124 train = train or self.traindata 125 test = test or self.testdata 126 item_popularity = dict() 127 for user ,items in train.items(): 128 for item in items.keys(): 129 item_popularity.setdefault(item,0) 130 item_popularity[item] += 1 131 ret = 0 132 n = 0 133 for user in train.keys(): 134 rank = self.recommend(user, train, k = k, nitem = nitem) 135 for item ,_ in rank.items(): 136 ret += math.log(1+item_popularity[item]) 137 n += 1 138 return ret / (n * 1.0) 139 140 def testRecommend(): 141 ubcf = UserBasedCF(‘u.data‘) 142 ubcf.readData() 143 ubcf.splitData(4,100) 144 ubcf.userSimilarity() 145 user = "345" 146 rank = ubcf.recommend(user,k = 3) 147 for i,rvi in rank.items(): 148 149 items = ubcf.testdata.get(user,{}) 150 record = items.get(i,0) 151 print "%5s: %.4f--%.4f" %(i,rvi,record) 152 def testUserBasedCF(): 153 cf = UserBasedCF(‘u.data‘) 154 cf.userSimilarityBest() 155 print "%3s%20s%20s%20s%20s" % (‘K‘,"recall",‘precision‘,‘coverage‘,‘popularity‘) 156 for k in [5,10,20,40,80,160]: 157 recall,precision = cf.recallAndPrecision( k = k) 158 coverage = cf.coverage(k = k) 159 popularity = cf.popularity(k = k) 160 print "%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100,precision * 100,coverage * 100,popularity) 161 162 if __name__ == "__main__": 163 testUserBasedCF()
时间: 2024-10-11 07:25:27