五、(3)朴素贝叶斯预测文章类别
"""
Created on Mon May 20 12:25:00 2019
@author: zhangzongji
"""
import os

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
def preprocess(path_name):
    """Read a UTF-8 text file and return its jieba tokens joined by spaces.

    Args:
        path_name: Path of the text file to read (decoded as UTF-8).

    Returns:
        A single string in which every token produced by ``jieba.cut`` is
        followed by one space (so a non-empty result ends with a space).
        An empty file yields an empty string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed, even if read() fails.
    with open(path_name, "r", encoding="utf-8") as text_file:
        raw_text = text_file.read()

    # Build the result in one pass instead of repeated string concatenation.
    # The trailing space after the last token is kept on purpose so the
    # output is identical to the previous implementation.
    return "".join(word + " " for word in jieba.cut(raw_text))
def loadtrainset(path, classtag):
    """Load every file in a directory as a training document of one class.

    Args:
        path: Directory whose files are the training documents.
        classtag: Label attached to every document loaded from ``path``.

    Returns:
        A tuple ``(processed_textset, allclasstags)`` where
        ``processed_textset`` is the list of ``preprocess`` results, one per
        file, and ``allclasstags`` is a list of the same length containing
        ``classtag`` for each document.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be read.
    """
    processed_textset = []
    for file_name in os.listdir(path):
        # os.path.join picks the right separator for the current platform.
        path_name = os.path.join(path, file_name)
        # Sub-directories (and other non-regular entries) cannot be read as
        # text; skip them instead of letting open() fail for the whole load.
        if not os.path.isfile(path_name):
            continue
        processed_textset.append(preprocess(path_name))

    # One label per document that was actually loaded.
    allclasstags = [classtag] * len(processed_textset)
    return processed_textset, allclasstags
# Training corpora: (directory holding the articles, class label).
train_sources = [
    (r"C:\Users\sun\Desktop\论文\算法代码\朴素贝叶斯算法\预测文章类别\所有文章类别\交通", "交通"),
    (r"C:\Users\sun\Desktop\论文\算法代码\朴素贝叶斯算法\预测文章类别\所有文章类别\教育", "教育"),
    (r"C:\Users\sun\Desktop\论文\算法代码\朴素贝叶斯算法\预测文章类别\所有文章类别\经济", "经济"),
    (r"C:\Users\sun\Desktop\论文\算法代码\朴素贝叶斯算法\预测文章类别\所有文章类别\军事", "军事"),
]

# Merge all categories into one document list and one parallel label list.
integrated_train_data = []
classtags_list = []
for corpus_dir, label in train_sources:
    texts, tags = loadtrainset(corpus_dir, label)
    integrated_train_data += texts
    classtags_list += tags

# Learn the vocabulary from the training documents and count term occurrences.
count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(integrated_train_data)

# use_idf=False disables IDF reweighting, so the features are normalized
# term frequencies. The transformer is kept so the exact same transformation
# is applied to unseen documents below.
tfidf_transformer = TfidfTransformer(use_idf=False)
train_tfidf = tfidf_transformer.fit_transform(vector_matrix)

# Fit the multinomial naive Bayes classifier on the training features.
clf = MultinomialNB().fit(train_tfidf, classtags_list)

# Document whose category is to be predicted.
testset = [
    preprocess(r"C:\Users\sun\Desktop\论文\算法代码\朴素贝叶斯算法\预测文章类别\把预测文章放入此文件夹\军事新闻.txt")
]

# Map the test document onto the vocabulary learned from the training set.
new_count_vector = count_vector.transform(testset)
# Reuse the transformer fitted on the training data rather than fitting a new
# one on the test document: test features must never be fitted on test data
# (this would silently skew results if IDF weighting were ever enabled).
new_tfidf = tfidf_transformer.transform(new_count_vector)

predict_result = clf.predict(new_tfidf)
print("预测文章类别为:", predict_result)
运行结果如图所示:
本文参考链接:https://www.cnblogs.com/d0main/p/6914742.html
搞定收工。
“☺☺☺ 若本篇文章对你有一丝丝帮助,请帮顶、评论点赞,谢谢。☺☺☺”
↓↓↓↓