Applications of Bayes
-
Spam Filtering
The most famous application of the Bayes classifier is spam filtering. For a detailed treatment I recommend the relevant chapters of 《黑客与画家》 (Hackers & Painters) or 《数学之美》 (The Beauty of Mathematics); for a basic implementation of Bayes, see here.
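At its core the method is the standard naive Bayes decision rule, which the code below implements: for a document with words $w_1, \dots, w_n$ and a class $c$ (spam or ham),

$$P(c \mid w_1, \dots, w_n) \propto P(c) \prod_{i=1}^{n} P(w_i \mid c)$$

and the classifier picks whichever class scores higher. In practice the comparison is done in log space, $\log P(c) + \sum_i \log P(w_i \mid c)$, which is exactly what trainNB0 and classifyNB in the listing below compute.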
Dataset
Two folders, one of normal (ham) email and one of spam, each containing 25 emails.
Test method
Randomly select 10 of the 50 emails as the test set.
Implementation details
1. First we need to convert the text into the vector form we need, which takes a little regular-expression work (a small tokenization sketch follows this list).
2. Since we validate on a random hold-out split, the random process means the results are not quite the same on each run.
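A minimal sketch of the tokenization step, using the same regex and length filter as textParse in the listing below (the sample sentence is just for illustration):

```python
import re

def tokenize(big_string):
    # Split on runs of non-word characters; \W+ avoids the empty
    # tokens that \W* would produce
    tokens = re.split(r'\W+', big_string)
    # Drop very short tokens and lowercase the rest
    return [tok.lower() for tok in tokens if len(tok) > 2]

print(tokenize('This book is the best book on Python I have ever laid eyes upon.'))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']
```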
```python
# coding=utf-8
import re
from numpy import *

# Parse a document into a list of lowercase tokens
def textParse(bigString):
    # \W+ (not \W*) avoids producing empty tokens
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Build a list of all unique words across the documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: 1 if the word appears, regardless of count
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model: bag of words, counting occurrences
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Laplace smoothing: prevents a single zero probability from
    # zeroing out the whole product
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities would underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def spamTest(spamFolder, hamFolder):
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open(spamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(hamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    #print(trainingSet)  # debug output
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        #trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        #wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error', docList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))
    #return vocabList, fullText

def main():
    spamTest('email/spam/', 'email/ham/')

if __name__ == '__main__':
    main()
```
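To see the difference between the two document models, here is a tiny demo (the toy vocabulary and document are invented for illustration; it assumes the listing above has been loaded):

```python
vocab = ['buy', 'cheap', 'meeting', 'now']  # toy vocabulary
doc = ['buy', 'cheap', 'cheap', 'now']      # toy document

print(setOfWords2Vec(vocab, doc))    # [1, 1, 0, 1]  -- presence only
print(bagOfWords2VecMN(vocab, doc))  # [1, 2, 0, 1]  -- word counts
```

The bag-of-words variant keeps duplicate-word information, which can help when a word appearing many times is itself a signal.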
-
Detecting Regional Tendencies from Personal Ads
This example pulls posts from two regional boards of a website and analyzes whether their word usage shows any regional patterns.
Dataset
The data is fetched over RSS, using Python's feedparser package (see here if you want to learn more about it). We grab the posts from two regional boards of the same site.
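A minimal sketch of how feedparser is used here; the 'entries' and 'summary' fields are what localWords in the listing below reads (the URL is the same NY feed the code uses):

```python
import feedparser  # third-party package: pip install feedparser

feed = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
# Each entry is one post; 'summary' holds the post body as a string
for entry in feed['entries'][:3]:
    print(entry['summary'][:80])
```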
Test method
Cross-validation (a random hold-out split, as above).
Implementation details
1. Two kinds of words need special handling (in fact they overlap heavily): the highest-frequency words, and the so-called stop words (as I understand them, words used very frequently but carrying little actual meaning); stop-word lists for many languages can be found here.
We need to remove these words so that the results better reflect regional differences (a small sketch follows this list).
2. The getTopWords function merely summarizes the most characteristic features from the estimated probabilities; it is not essential for learning Bayes.
3. Apart from the data source, the implementation details are very similar to the spam example above.
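A minimal sketch of the filtering idea from point 1 (the toy word lists are invented; the real stopWords function in the listing below assumes a stopwords.txt file with one word per line):

```python
vocab = ['the', 'flat', 'and', 'sunny', 'mission', 'for']
stop = ['the', 'and', 'for']  # toy stop list for illustration

# Drop stop words so that region-specific vocabulary carries the signal
vocab = [w for w in vocab if w not in stop]
print(vocab)  # ['flat', 'sunny', 'mission']
```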
-
```python
# coding=utf-8
import re
import feedparser
from numpy import *

# Parse a document into a list of lowercase tokens
def textParse(bigString):
    # \W+ (not \W*) avoids producing empty tokens
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Build a list of all unique words across the documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: 1 if the word appears, regardless of count
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model: bag of words, counting occurrences
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Laplace smoothing: prevents a single zero probability from
    # zeroing out the whole product
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities would underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Load stop words from a file with one word per line
def stopWords():
    stopW = []
    for eachLine in open('stopwords.txt').readlines():
        stopW.append(eachLine.strip())
    return stopW

# The 30 most frequent words in the corpus
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(rss1, rss0):
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    for word in stopWords():  # remove stop words (match whole words)
        if word in vocabList:
            vocabList.remove(word)
    trainingSet = list(range(2 * minLen)); testSet = []  # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) via trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

def main():
    #print(stopWords())
    localWords('http://newyork.craigslist.org/stp/index.rss',
               'http://sfbay.craigslist.org/stp/index.rss')

if __name__ == '__main__':
    main()
```
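One design note: p0V and p1V hold log-probabilities, so the -6.0 cutoff in getTopWords keeps a word only when its conditional probability satisfies

$$P(w \mid c) > e^{-6} \approx 0.0025$$

that is, it prints only the words most characteristic of each board rather than the entire vocabulary.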