# -*- coding: utf-8 -*-
"""Naive Bayes text classification demo.

Contains a toy abusive-post corpus (``loadDataSet`` / ``testingNB``), a
spam e-mail hold-out test (``spamTest``) and a comparison of the words used
in two RSS feeds (``localWords`` / ``getTopWords``).

The code uses only single-argument ``print(...)`` calls and list-based
ranges so that it behaves the same on Python 2.7 and Python 3.
"""
import operator
import re
from collections import Counter

# The star import supplies array, ones, log, sum and random used below.
from numpy import *  # noqa: F401,F403

try:
    import feedparser
except ImportError:
    # Only the RSS demo under ``__main__`` calls feedparser; the classifier
    # functions stay importable without it.
    feedparser = None

# Runs of non-word characters separate tokens.  ``+`` (not ``*``) is
# required: a pattern that can match the empty string makes re.split cut
# between every character on Python 3.7+.
_TOKEN_SPLIT = re.compile(r'\W+')


def loadDataSet():
    """Return the toy corpus as ``(postingList, classVec)``.

    ``postingList`` is a list of tokenised posts and ``classVec`` holds one
    label per post: 1 for an abusive post, 0 for a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the distinct words found in all documents.

    The order of the returned list is the iteration order of a set and is
    therefore not guaranteed.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)
    return list(vocabSet)


def _vocabIndex(vocabList):
    """Map each word to the position of its first occurrence in vocabList."""
    index = {}
    for position, word in enumerate(vocabList):
        index.setdefault(word, position)
    return index


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over ``vocabList``.

    Words missing from the vocabulary are reported with ``print`` and
    otherwise ignored.
    """
    index = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is None:
            print("这个词条: %s 不在我的词典中!" % word)
        else:
            returnVec[position] = 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate the Naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: non-empty sequence of equal-length word vectors.
        trainCategory: one label per vector; 1 marks the positive class.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log-probabilities for
        class 0 and class 1, and the fraction of documents labelled 1.

    Raises:
        ValueError: if the number of labels differs from the number of
            documents.
    """
    numTrainDocs = len(trainMatrix)
    if len(trainCategory) != numTrainDocs:
        raise ValueError('trainMatrix has %d rows but trainCategory has %d labels'
                         % (numTrainDocs, len(trainCategory)))
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class still gets a non-zero probability and the log stays finite.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec
            p1Denom += sum(docVec)  # total words seen in class 1
        else:
            p0Num += docVec
            p0Denom += sum(docVec)  # total words seen in class 0
    # Logs turn the product of per-word probabilities into a sum.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 log score of the vector is higher, else 0."""
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the toy corpus and print the class of two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print('%s classified as:  %s'
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a word-count vector over ``vocabList``.

    Words missing from the vocabulary are silently ignored.
    """
    index = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec


def textParse(bigString):
    """Split text on non-word characters and return the lower-cased tokens.

    Tokens of one or two characters are dropped.
    """
    return [tok.lower() for tok in _TOKEN_SPLIT.split(bigString) if len(tok) > 2]


def _readTokens(path):
    """Read the file at ``path`` and return its tokens (see textParse)."""
    with open(path) as handle:
        return textParse(handle.read())


def _holdOutSplit(numDocs, numTest):
    """Randomly split ``range(numDocs)`` into (trainingSet, testSet).

    ``numTest`` is reduced when necessary so that at least one document is
    left for training.
    """
    if numDocs < 2:
        raise ValueError('need at least 2 documents to hold out a test set, got %d'
                         % numDocs)
    if numTest > numDocs - 1:
        numTest = numDocs - 1
    trainingSet = list(range(numDocs))
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    return trainingSet, testSet


def _holdOutEvaluate(vocabList, docList, classList, numTest):
    """Train on a random subset and classify the held-out documents.

    Both phases use bagOfWords2VecMN so training and test features match.
    Returns ``(p0V, p1V, errorRate, misclassifiedDocs)``.
    """
    trainingSet, testSet = _holdOutSplit(len(docList), numTest)
    trainMat = [bagOfWords2VecMN(vocabList, docList[i]) for i in trainingSet]
    trainClasses = [classList[i] for i in trainingSet]
    p0V, p1V, pClass1 = trainNB0(array(trainMat), array(trainClasses))
    misclassified = []
    for i in testSet:
        wordVector = array(bagOfWords2VecMN(vocabList, docList[i]))
        if classifyNB(wordVector, p0V, p1V, pClass1) != classList[i]:
            misclassified.append(docList[i])
    errorRate = float(len(misclassified)) / len(testSet)
    return p0V, p1V, errorRate, misclassified


def spamTest():
    """Run a hold-out spam test on ``email/spam`` and ``email/ham``.

    Reads ``1.txt`` .. ``25.txt`` from each directory, holds out 10 random
    documents, trains on the rest, prints every misclassified document and
    the error rate, and returns the error rate.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            docList.append(_readTokens(pattern % i))
            classList.append(label)
    vocabList = createVocabList(docList)
    _, _, errorRate, misclassified = _holdOutEvaluate(
        vocabList, docList, classList, 10)
    for doc in misclassified:
        print('classification error %s' % (doc,))
    print('the error rate is:  %s' % errorRate)
    return errorRate


def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words in ``fullText``.

    The result is a list of ``(word, count)`` pairs sorted by descending
    count.
    """
    counts = Counter(fullText)  # one pass instead of list.count per word
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]


def localWords(feed1, feed0):
    """Train and test a classifier that tells two parsed feeds apart.

    Args:
        feed1: mapping with an ``'entries'`` list whose items have a
            ``'summary'`` string; its entries are labelled class 1.
        feed0: same structure; its entries are labelled class 0.

    The 30 most frequent words are removed from the vocabulary, up to 20
    random documents are held out for testing, and the error rate is printed.

    Returns:
        ``(vocabList, p0V, p1V)``.
    """
    numEntries1 = len(feed1['entries'])
    numEntries0 = len(feed0['entries'])
    # Plain comparison rather than min(): avoids depending on whether the
    # NumPy star import shadows the builtin.
    minLen = numEntries1 if numEntries1 < numEntries0 else numEntries0
    docList = []
    classList = []
    fullText = []
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    # Drop the most frequent words from the vocabulary.
    frequent = set(word for word, _ in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequent]
    p0V, p1V, errorRate, _ = _holdOutEvaluate(vocabList, docList, classList, 20)
    print('the error rate is:  %s' % errorRate)
    return vocabList, p0V, p1V


def getTopWords(ny, sf):
    """Print, per feed, the words whose log-probability exceeds -6.0.

    ``ny`` is used as class 1 and ``sf`` as class 0 (see localWords); each
    list is printed in order of descending log-probability.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topSF = [(word, p) for word, p in zip(vocabList, p0V) if p > -6.0]
    topNY = [(word, p) for word, p in zip(vocabList, p1V) if p > -6.0]
    sections = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**", topNY),
    )
    for banner, pairs in sections:
        ranked = sorted(pairs, key=operator.itemgetter(1), reverse=True)
        print(banner)
        for word, _ in ranked:
            print(word)


if __name__ == '__main__':
    # testingNB() and spamTest() are the offline demos; the default run
    # compares two Craigslist RSS feeds and needs network access.
    if feedparser is None:
        raise SystemExit('feedparser is required for the RSS demo')
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    getTopWords(ny, sf)
# Time: 2024-10-13 12:20:23