Adapted from an answer on: https://stackoverflow.com/
Text-classification example code (verified to run).
"""SMS spam text classification.

Reads the UCI SMSSpamCollection dataset (one tab-separated ``label\tmessage``
pair per line), cleans each message with NLTK, vectorizes with bag-of-words /
TF-IDF, and compares several scikit-learn classifiers (multinomial naive
Bayes, SGD, linear SVM, random forest), printing a confusion matrix and a
classification report for each.
"""
import csv

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn import tree  # kept for the disabled decision-tree experiment below
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


def preprocessing(text):
    """Clean one raw message and return it as a single space-joined string.

    Steps: sentence + word tokenization, English stop-word removal, dropping
    tokens shorter than 3 characters, lowercasing, and WordNet lemmatization.
    """
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Remove English stop words before length filtering, as the original did.
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # Drop very short tokens and normalize case.
    tokens = [word.lower() for word in tokens if len(word) >= 3]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)


def load_dataset(path='SMSSpamCollection'):
    """Read the tab-separated dataset; return (labels, cleaned messages)."""
    labels, messages = [], []
    # NOTE(review): 'mac_roman' matches the original code; the upstream UCI
    # file is usually UTF-8 — confirm against the actual data file.
    with open(path, 'r', newline='', encoding='mac_roman') as csvfile:
        for line in csv.reader(csvfile, delimiter='\t'):
            labels.append(line[0])
            messages.append(preprocessing(line[1]))
    return labels, messages


def main():
    sms_lablel, smsdata_data = load_dataset()
    print('smsdata_data', smsdata_data)
    print('sms_lablel', sms_lablel)

    # ------------------------------------------------------------------
    # Sampling: 70:30 train/test split.
    # ------------------------------------------------------------------
    trainset_size = int(round(len(smsdata_data) * 0.70))
    print('The training set size for this classifier is ' + str(trainset_size) + '\n')
    x_train = np.array(smsdata_data[:trainset_size])
    y_train = np.array(sms_lablel[:trainset_size])
    # BUG FIX: the original sliced from ``trainset_size + 1`` and silently
    # dropped the sample at index ``trainset_size`` from the test set.
    x_test = np.array(smsdata_data[trainset_size:])
    y_test = np.array(sms_lablel[trainset_size:])
    print("x_train:====", x_train)
    print("y_train:====", y_train)

    # ------------------------------------------------------------------
    # Term-document matrix demo (bag of words). Reuses the already-cleaned
    # messages instead of re-reading and re-preprocessing the whole file.
    # ------------------------------------------------------------------
    count_vectorizer = CountVectorizer(min_df=1)
    X_exp = count_vectorizer.fit_transform(smsdata_data)
    # get_feature_names() was removed in scikit-learn 1.2.
    print("||".join(count_vectorizer.get_feature_names_out()))
    print('X_exp>>>>>', X_exp.toarray())

    # ------------------------------------------------------------------
    # TF-IDF features for the classifiers below.
    # ------------------------------------------------------------------
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='unicode', norm='l2')
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)
    print('x_train:.....>>>>', X_train)
    print('x_test:>>>>>>>', X_test)

    # ------------------------------------------------------------------
    # Multinomial naive Bayes.
    # ------------------------------------------------------------------
    clf = MultinomialNB().fit(X_train, y_train)
    y_nb_predicted = clf.predict(X_test)
    print("y_nb_predicted>>", y_nb_predicted)
    print('\n confusion_matrix \n ')
    print(metrics.confusion_matrix(y_test, y_nb_predicted))
    print('\n Here is the classification report:')
    print(metrics.classification_report(y_test, y_nb_predicted))

    # Show the n most-/least-indicative features. ``coef_`` was removed in
    # scikit-learn 1.2; for a binary NB model ``coef_[0]`` was an alias of
    # ``feature_log_prob_[1]`` (log P(feature | positive class)).
    feature_names = vectorizer.get_feature_names_out()
    coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
    n = 15
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

    # ------------------------------------------------------------------
    # Decision tree (disabled in the original; kept for reference).
    # ------------------------------------------------------------------
    # clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
    # y_tree_predicted = clf.predict(X_test.toarray())
    # print(y_tree_predicted)
    # print(' \n Here is the classification report: y_tree_predicted')
    # print(metrics.classification_report(y_test, y_tree_predicted))

    # ------------------------------------------------------------------
    # Stochastic gradient descent.
    # ------------------------------------------------------------------
    clf = SGDClassifier(alpha=0.001, max_iter=50).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('\n Here is the classification report:')
    print(metrics.classification_report(y_test, y_pred))
    print(' \n confusion_matrix \n ')
    print(metrics.confusion_matrix(y_test, y_pred))

    # ------------------------------------------------------------------
    # Linear support vector machine.
    # ------------------------------------------------------------------
    svm_classifier = LinearSVC().fit(X_train, y_train)
    y_svm_predicted = svm_classifier.predict(X_test)
    print('\n Here is the classification report:')
    print(metrics.classification_report(y_test, y_svm_predicted))
    print(' \n confusion_matrix \n ')
    print(metrics.confusion_matrix(y_test, y_svm_predicted))

    # ------------------------------------------------------------------
    # Random forest.
    # ------------------------------------------------------------------
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_RF_pred = clf.predict(X_test)
    print('RF_confusion_matrix:')
    print(metrics.confusion_matrix(y_test, y_RF_pred))
    print('RF_classification_report:')
    print(metrics.classification_report(y_test, y_RF_pred))


if __name__ == "__main__":
    main()
Download the UCI SMS spam dataset from:
http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection