五、(2)朴素贝叶斯预测邮件类别
from numpy
import *
from functools
import reduce
# Class label that stands for an advertisement e-mail; printClass compares a
# predicted class against this value to choose which verdict to print.
adClass = 1
def loadDataSet():
    """Return the built-in training corpus together with its labels.

    The samples are hard-coded; any mix of advertisement and normal mails
    could be used instead.

    Returns:
        A ``(wordsList, classVec)`` tuple. ``wordsList`` is a list of
        tokenised documents (each a list of words) and ``classVec`` is the
        list of labels in the same order, each 1 or 0.
    """
    # Keep every label next to its tokens so the pairing is visible at a glance.
    samples = [
        (1, ['优惠', '返利', '打折', '优惠', '金融', '理财']),
        (0, ['人生', '迷茫', '不知', '何去何从', '失望', '轻生', '维持', '恩爱']),
        (1, ['大促销', '跳楼价', '买就送', '优惠', '增值税', '打折']),
        (0, ['贵阳', '今天', '天气', '炎热', '不宜', '外出', '紫外线', '强烈']),
        (1, ['招聘', '兼职', '全职', '薪资', '年收入', '千万']),
        (0, ['汽车', '展览', '上海', '概念', '炫酷']),
        (1, ['赌博', '澳门', '线上', '荷官', '美女']),
    ]
    wordsList = [tokens for _, tokens in samples]
    classVec = [label for label, _ in samples]
    return wordsList, classVec
def doc2VecList(docList):
    """Build the vocabulary: every distinct word that occurs in ``docList``.

    Args:
        docList: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word exactly once, ordered by first
        appearance. An empty ``docList`` yields an empty list.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # vocabulary (and therefore every vector index) is reproducible between
    # runs. It also handles an empty corpus and removes duplicates when there
    # is only one document, which a reduce() without an initial value does not.
    return list(dict.fromkeys(word for doc in docList for word in doc))
def words2Vec(vecList, inputWords):
    """Convert a sequence of words into a count vector over ``vecList``.

    Args:
        vecList: Vocabulary list of hashable words; element ``i`` of the
            result counts the occurrences of ``vecList[i]``.
        inputWords: Iterable of words to count.

    Returns:
        A numpy array of length ``len(vecList)`` holding, for every
        vocabulary word, how often it occurs in ``inputWords``. A word that
        is not in the vocabulary is reported on stdout and otherwise ignored.
    """
    # Build the word -> position table once instead of scanning the
    # vocabulary twice (membership test + index) for every input word.
    # setdefault keeps the first position if the vocabulary has duplicates,
    # which is what list.index() would have returned.
    position = {}
    for idx, word in enumerate(vecList):
        position.setdefault(word, idx)

    resultVec = [0] * len(vecList)
    for word in inputWords:
        idx = position.get(word)
        if idx is None:
            print('没有发现此单词')
        else:
            resultVec[idx] += 1
    return array(resultVec)
def trainNB(trainMatrix
, trainClass
):
"""计算,生成每个词对于类别上的概率"""
numTrainClass
= len(trainClass
)
numWords
= len(trainMatrix
[0])
p0Num
= ones
(numWords
)
p1Num
= ones
(numWords
)
p0Words
= 2.0
p1Words
= 2.0
for i
in range(numTrainClass
):
if trainClass
[i
] == 1:
p1Num
+= trainMatrix
[i
]
p1Words
+= sum(trainMatrix
[i
])
else:
p0Num
+= trainMatrix
[i
]
p0Words
+= sum(trainMatrix
[i
])
p0Vec
= log
(p0Num
/ p0Words
)
p1Vec
= log
(p1Num
/ p1Words
)
pClass1
= sum(trainClass
) / float(numTrainClass
)
return p0Vec
, p1Vec
, pClass1
def classifyNB(testVec, p0Vec, p1Vec, pClass1):
    """Choose the class with the higher log-score for ``testVec``.

    Args:
        testVec: Word-count vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        0 when the class-0 score is strictly greater than the class-1 score,
        otherwise 1 (so a tie is resolved as class 1).
    """
    # Scores are sums of logarithms: weighted word log-probabilities plus
    # the logarithm of the class prior.
    class1_score = sum(testVec * p1Vec) + log(pClass1)
    class0_score = sum(testVec * p0Vec) + log(1 - pClass1)
    return 0 if class0_score > class1_score else 1
def printClass(words, testClass):
    """Print ``words`` followed by the verdict for ``testClass``.

    Args:
        words: The value printed in front of the verdict.
        testClass: Predicted class; it is reported as an advertisement mail
            when it equals the module-level ``adClass`` and as a normal mail
            otherwise.
    """
    verdict = '推测为:广告邮件' if testClass == adClass else '推测为:正常邮件'
    print(words, verdict)
def tNB():
    """Train on the built-in corpus and classify four sample word lists.

    Loads the corpus, builds the vocabulary, turns every document into a
    count vector, trains the model, then prints the verdict for each sample
    in turn.
    """
    docList, classVec = loadDataSet()
    vocabulary = doc2VecList(docList)
    trainMat = [words2Vec(vocabulary, doc) for doc in docList]
    p0V, p1V, pClass1 = trainNB(trainMat, classVec)

    # The samples are classified and reported in exactly this order.
    samples = (
        ['贵阳', '概念', '荷官'],
        ['轻生', '迷茫', '汽车'],
        ['美女', '赌博', '汽车'],
        ['线上', '赌博', '荷官'],
    )
    for testWords in samples:
        testVec = words2Vec(vocabulary, testWords)
        testClass = classifyNB(testVec, p0V, p1V, pClass1)
        printClass(testWords, testClass)
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    tNB()
运行结果如下:
本文参考文章链接:https://blog.csdn.net/stevesea/article/details/82877686
搞定收工。
“☺☺☺ 若本篇文章对你有一丝丝帮助,请帮顶、评论点赞,谢谢。☺☺☺”
↓↓↓↓