
For a theoretical introduction to Naive Bayes, see these two companion posts: 帶你搞懂樸素貝葉斯分類算法 (a walkthrough of the Naive Bayes classification algorithm) and python代碼實(shí)現(xiàn)樸素貝葉斯分類算法 (implementing it in Python).
'''Classify whether a post is an abusive post'''
import numpy as np
def loadDataSet():  # build a small sample text data set
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 0 = normal post, 1 = abusive post
    return postingList, classVec
def createVocabList(dataSet):  # build a vocabulary list from the text data set
    vocabSet = set()  # a set keeps every word in the vocabulary unique
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the running set and this document's words
    return list(vocabSet)
def setofwordsvec(vocabList, inputSet):  # check which vocabulary words appear in the input
    # (set-of-words model: only records whether a word occurs)
    returnvec = [0]*len(vocabList)  # all-zeros vector, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnvec[vocabList.index(word)] = 1  # the word is in the vocabulary:
            # set its slot to 1
        else:
            print('the word: {} is not in my vocabulary'.format(word))
    return returnvec
def bagofwordsvec(vocabList, inputSet):  # check which vocabulary words appear in the input
    # (bag-of-words model: counts how many times each word occurs)
    returnvec = [0]*len(vocabList)  # all-zeros vector, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnvec[vocabList.index(word)] += 1  # the word is in the vocabulary:
            # add 1 to its slot
        else:
            print('the word: {} is not in my vocabulary'.format(word))
    return returnvec
# Naive Bayes classifiers are usually implemented in one of two ways: with a Bernoulli model
# or with a multinomial model. The Bernoulli model ignores counts and only records whether a
# word appears, effectively giving every word the same weight; the multinomial model counts
# occurrences, so different words carry different weights. The short check just below shows
# the difference.
listOposts,listclasses = loadDataSet()
myvocablist = createVocabList(listOposts)
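A quick sanity check of the two encodings (a minimal sketch using the toy data loaded above; the input list is made up so that a word repeats):

sample = ['stupid', 'stupid', 'garbage']  # hypothetical input with a repeated word
print(setofwordsvec(myvocablist, sample))  # the 'stupid' slot is 1: presence only
print(bagofwordsvec(myvocablist, sample))  # the 'stupid' slot is 2: occurrence count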
'''First version

def trainNBO(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)  # number of documents
    numWords = len(trainMatrix[0])  # length of each document vector (the vocabulary size)
    pAbusive = np.sum(trainCategory)/float(numTrainDocs)  # the labels of the abusive
    # documents sum to 3; dividing by the total document count gives the prior
    # probability of an abusive document, p(1)
    p0Num = np.zeros(numWords)  # all-zeros vector as long as the vocabulary
    p1Num = np.zeros(numWords)  # note that these are of type np.ndarray
    p0Denom = 0
    p1Denom = 0
    for i in range(numTrainDocs):  # iterate over all documents
        if trainCategory[i] == 1:  # label 1 means an abusive document
            p1Num += trainMatrix[i]  # accumulate how often every word occurs;
            # since these are np.ndarray, the addition is element-wise
            p1Denom += np.sum(trainMatrix[i])  # count the total number of words
        else:  # there are only two classes, 0 and 1, so a plain else suffices; with more
            # classes we would need extra branches and extra counters
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom  # each word's share of all words in the abusive class,
    # i.e. p(wi|c1)
    p0Vect = p0Num/p0Denom  # likewise p(wi|c0)  (i, 0 and 1 are subscripts)
    return p0Vect, p1Vect, pAbusive
'''
# Under the naive Bayes assumption, p(w|c) = p(w1|c)p(w2|c)...p(wn|c), so no single factor
# may be 0. In the code above we therefore change p0Num and p1Num to np.ones(numWords) and
# initialize p0Denom and p1Denom to 2 (Laplace smoothing).
# Also, when the factors are very small, the product p(w|c) = p(w1|c)p(w2|c)...p(wn|c) can
# easily underflow or give a wrong answer, so we work with natural logarithms instead:
# p1Vect = np.log(p1Num/p1Denom). f(x) and ln(f(x)) rise and fall together, so the
# comparison between classes is unchanged.
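A small numeric illustration of the underflow these comments describe (a sketch with made-up probabilities, not values from the model):

probs = np.full(500, 1e-3)     # 500 hypothetical factors p(wi|c), each equal to 0.001
print(np.prod(probs))          # 0.0 -- the raw product underflows a float64
print(np.sum(np.log(probs)))   # about -3453.88 -- the log-space sum stays well-behaved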
'''Optimized version'''
def trainNBO(trainMatrix, trainCategory):  # compute the class prior and the probability
    # of each word appearing in each class
    numTrainDocs = len(trainMatrix)  # number of documents
    numWords = len(trainMatrix[0])  # length of each document vector (the vocabulary size)
    pAbusive = np.sum(trainCategory)/float(numTrainDocs)  # the labels of the abusive
    # documents sum to 3; dividing by the total document count gives the prior p(1)
    p0Num = np.ones(numWords)  # all-ones vector as long as the vocabulary (Laplace smoothing)
    p1Num = np.ones(numWords)  # note that these are of type np.ndarray
    p0Denom = 2.0  # the denominators start at 2 to match the smoothed numerators
    p1Denom = 2.0
    for i in range(numTrainDocs):  # iterate over all documents
        if trainCategory[i] == 1:  # label 1 means an abusive document
            p1Num += trainMatrix[i]  # accumulate how often every word occurs;
            # since these are np.ndarray, the addition is element-wise
            p1Denom += np.sum(trainMatrix[i])  # count the total number of words
        else:  # only two classes, so a plain else suffices
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)  # log of each word's share of all words in the abusive
    # class, i.e. log p(wi|c1)
    p0Vect = np.log(p0Num/p0Denom)  # log p(wi|c0)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):  # compute the final score for each
    # class and compare them
    p1 = np.sum(vec2Classify*p1Vec) + np.log(pClass1)  # in log space the products in the
    # theorem become sums; the denominator of Bayes' theorem is dropped because it is the
    # same for both classes, so only the numerators are compared
    p0 = np.sum(vec2Classify*p0Vec) + np.log(1-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
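Spelled out, the two scores are the log numerators of Bayes' theorem: p1 = log p(c1) + Σi log p(wi|c1) and p0 = log p(c0) + Σi log p(wi|c0). The shared denominator p(w) scales both posteriors equally, so whichever sum is larger names the class.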
def testingNB():  # test driver for the classifier
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setofwordsvec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNBO(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setofwordsvec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setofwordsvec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
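Running the driver ties everything together; with the smoothed, log-space trainNBO above it should print the following ('dalmation' only ever appears in normal posts, while 'stupid' and 'garbage' only appear in abusive ones):

testingNB()
# ['love', 'my', 'dalmation'] classified as: 0
# ['stupid', 'garbage'] classified as: 1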
Example: filtering spam email with Naive Bayes
import re
def textParse(bigString):  # normalize an email body into a list of tokens
    listoftokens = re.split(r'\W+', bigString)  # split the string on non-word characters
    # (the original r'\W*' also matches empty strings, which splits between every character)
    return [tok.lower() for tok in listoftokens if len(tok) > 2]  # lower-case every token
    # and drop tokens of length 2 or less
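A quick check of the tokenizer on a made-up string; short tokens such as 'Hi' and 'is' are dropped and everything is lower-cased:

print(textParse('Hi, this is a TEST sentence: 42 tokens??'))
# -> ['this', 'test', 'sentence', 'tokens']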
def spamTest():
    doclist = []
    classlist = []
    fulltext = []
    for i in range(1, 26):  # open all the sample emails and collect their tokens
        emailText = open('D:/Anaconda/test/機(jī)器學(xué)習(xí)/Ch04/email/spam/{}.txt'.format(i), encoding='gbk').read()
        wordlist = textParse(emailText)
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(1)
        emailText = open('D:/Anaconda/test/機(jī)器學(xué)習(xí)/Ch04/email/ham/{}.txt'.format(i), encoding='gbk').read()
        wordlist = textParse(emailText)
        doclist.append(wordlist)
        fulltext.extend(wordlist)
        classlist.append(0)
    vocablist = createVocabList(doclist)  # build the vocabulary
    trainingSet = list(range(50))  # 50 documents in total
    testSet = []
    for i in range(10):  # randomly pick 10 documents for the test set
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # the remaining 40 documents form the training set
        trainMat.append(setofwordsvec(vocablist, doclist[docIndex]))
        # convert each of the 40 training documents into a vector and append it to trainMat
        trainClasses.append(classlist[docIndex])  # append the matching label to trainClasses
    p0V, p1V, pSpam = trainNBO(np.array(trainMat), np.array(trainClasses))
    # estimate the probabilities
    errorCount = 0
    for docIndex in testSet:  # measure the error rate on the held-out test documents
        wordVector = setofwordsvec(vocablist, doclist[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classlist[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount)/len(testSet))
# Randomly choosing part of the data as the training set and holding out the rest as the
# test set is known as hold-out cross-validation.
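Because the 10 test documents are drawn at random, the printed error rate varies between runs; a sketch of a driver (it assumes the email sample files exist at the paths hard-coded in spamTest above):

for _ in range(5):  # repeat to see how the hold-out error varies with the random split
    spamTest()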