Naive Bayes (NBM)
Approach
Naive Bayes (NBM) is a method built on Bayes' theorem together with the assumption that the features are conditionally independent of one another. Given a training set, it first learns the joint probability distribution of inputs and outputs under the premise that the feature words are mutually independent; then, using the learned model, for an input $x$ it outputs the $y$ that maximizes the posterior probability.
Mathematical principles
1. Bayes' theorem
$$P(A|B)=\frac{P(A,B)}{P(B)}=\frac{P(B|A)P(A)}{P(B)}$$
2. Conditional independence
$$P(x_{1}, x_{2}, ... , x_{n}|y) = \prod_{i=1}^{n}P(x_{i}|y)$$
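Combining the two: by Bayes' theorem the posterior is the prior times the likelihood divided by $P(x_{1},\ldots,x_{n})$, and since that denominator is the same for every class, the prediction reduces to

$$\hat{y}=\arg\max_{y}\,P(y)\prod_{i=1}^{n}P(x_{i}|y)$$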
Notes
- Because we keep multiplying the terms $P(x_{i}|y)$, a single factor equal to 0 makes the whole product 0. The fix is to initialize every word count to 1 and every denominator to 2 (Laplace / add-one smoothing), which is exactly what the train function below does.
- The other issue is underflow: when every probability is small, their long product rounds down to 0 in floating point. The fix is to take logarithms, $P \rightarrow \log P$, turning the product into a sum; addition does not suffer from this problem. A minimal sketch follows this list.
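A minimal sketch of the underflow problem and of the log fix, using toy numbers that are independent of the classifier below:

import numpy as np

probs = np.full(1000, 1e-10)    # 1000 tiny conditional probabilities
print(np.prod(probs))           # 0.0 -- the product underflows
print(np.sum(np.log(probs)))    # about -23025.85 -- the log-sum stays usable for comparison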
Code
import numpy as np
from math import log
import re
from random import uniform
def loadDataSet():
    # Toy forum posts; class 1 marks an insulting post, class 0 a normal one
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createvocablelist(data):
    # Union of the words of all documents -> deduplicated vocabulary list
    vocable = set()
    for document in data:
        vocable = vocable | set(document)
    return list(vocable)
def setwordvec(vocable, inputset):
    # Bag-of-words model: count how often each vocabulary word occurs in the input
    veclist = [0] * len(vocable)
    for word in inputset:
        if word in vocable:
            veclist[vocable.index(word)] += 1
    return veclist
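# Quick illustration (hypothetical toy call): with vocable = ['dog', 'stupid', 'my'],
# setwordvec(vocable, ['my', 'dog', 'my']) returns [1, 0, 2] -- a bag-of-words count vector.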
def train(traindata, traincategory):
    numtraindocs = len(traindata)
    numwords = len(traindata[0])
    # Prior P(class 1): fraction of training documents labelled 1
    PA = sum(traincategory) / float(numtraindocs)
    # Laplace smoothing: start every word count at 1 and every denominator at 2,
    # so an unseen word never yields a zero probability (first note above)
    p0num = np.ones(numwords)
    p1num = np.ones(numwords)
    p0denom = 2
    p1denom = 2
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += traindata[i]
            p1denom += sum(traindata[i])
        else:
            p0num += traindata[i]
            p0denom += sum(traindata[i])
    # Work in log space to avoid underflow (second note above)
    p1vect = np.log(p1num / p1denom)
    p0vect = np.log(p0num / p0denom)
    return PA, p1vect, p0vect
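# At this point p1vect[i] = log P(word_i | class 1) and p0vect[i] = log P(word_i | class 0),
# both Laplace-smoothed, while PA = P(class 1); these plug directly into the formulas above.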
def classify(vec2classify, p0vec, p1vec, PA):
    # Compare log posteriors: log P(c) + sum_i count_i * log P(word_i | c);
    # the shared evidence term log P(x) is dropped since it does not affect the argmax
    p1 = sum(vec2classify * p1vec) + log(PA)
    p0 = sum(vec2classify * p0vec) + log(1 - PA)
    if p1 > p0:
        return 1
    else:
        return 0
def testNB():
    # Smoke test on the toy forum posts
    postingList, classVec = loadDataSet()
    vocable = createvocablelist(postingList)
    traindata = []
    for posting in postingList:
        traindata.append(setwordvec(vocable, posting))
    PA, p1vect, p0vect = train(traindata, classVec)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setwordvec(vocable, testEntry))
    print(testEntry, 'classified as: ', classify(thisDoc, p0vect, p1vect, PA))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setwordvec(vocable, testEntry))
    print(testEntry, 'classified as: ', classify(thisDoc, p0vect, p1vect, PA))
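# Expected output (deterministic, since testNB involves no randomness):
# ['love', 'my', 'dalmation'] classified as:  0
# ['stupid', 'garbage'] classified as:  1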
def textparse(sentence):
    # Split on runs of non-word characters, drop short tokens, lowercase the rest
    stringlist = re.split(r'\W+', sentence)
    return [word.lower() for word in stringlist if len(word) > 2]
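# Quick illustration (hypothetical call): textparse('Hello, World! ab') returns
# ['hello', 'world'] -- 'ab' is dropped because tokens of length <= 2 are filtered out.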
def emailtest():
    doclist = []
    classlist = []
    # Load 25 spam and 25 ham emails (class 1 = spam, class 0 = ham)
    for i in range(1, 26):
        wordlist = textparse(open('./email/spam/%d.txt' % i).read())
        doclist.append(wordlist)
        classlist.append(1)
        wordlist = textparse(open('./email/ham/%d.txt' % i).read())
        doclist.append(wordlist)
        classlist.append(0)
    emailvocable = createvocablelist(doclist)
    # Hold-out cross-validation: randomly move 10 of the 50 documents into the test set
    trainingset = list(range(50))
    testset = []
    for i in range(10):
        randindex = int(uniform(0, len(trainingset)))
        testset.append(trainingset[randindex])
        del trainingset[randindex]
    traindata = []
    trainclass = []
    for trainindex in trainingset:
        traindata.append(setwordvec(emailvocable, doclist[trainindex]))
        trainclass.append(classlist[trainindex])
    pspam, p1, p0 = train(traindata, trainclass)
    error = 0
    for testindex in testset:
        testvec = setwordvec(emailvocable, doclist[testindex])
        if classify(np.array(testvec), p0, p1, pspam) != classlist[testindex]:
            error += 1
    return float(error) / len(testset)
if __name__ == "__main__":
    # Average the hold-out error rate over 1000 random train/test splits
    total = 0
    for i in range(1000):
        total += emailtest()
    print(total / 1000)
Summary
As I keep learning Python, the coding comes more and more naturally, and working backwards from code to the theory is quite convenient; reading only the watermelon book 🍉 gives me a headache, so doing both together feels much better. The next post should be logistic regression.