https://github.com/RaRe-Technologies/gensim
pip install -U gensim
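The preprocessing code in the example below also relies on NLTK's sentence/word tokenizers, its English stopword list, and the WordNet lemmatizer, all of which need a one-time data download. A minimal setup sketch (not part of the original listing; the resource names are the standard NLTK package IDs):

import nltk
import gensim

nltk.download('punkt')      # tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

print(gensim.__version__)   # quick check that gensim imported correctly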
Code example
import nltk
import collections
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
from gensim import corpora, models


# Text cleaning / preprocessing
def preprocessing(text):
    # sentence tokenization followed by word tokenization
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # drop words shorter than 3 characters and lowercase the rest
    tokens = [word.lower() for word in tokens if len(word) >= 3]
    # lemmatize: map inflected/variant word forms back to a common base form
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = " ".join(tokens)
    return preprocessed_text


# Load the SMS spam corpus (tab-separated: label, message text)
smsdata_data = []
sms_labels = []
with open('SMSSpamCollection', 'r', newline='', encoding='mac_roman') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter='\t')
    for line in csv_reader:
        sms_labels.append(line[0])
        smsdata_data.append(preprocessing(line[1]))

print('smsdata_data', smsdata_data)
print('sms_labels', sms_labels)

# Sampling: split into training and test sets (70:30)
trainset_size = int(round(len(smsdata_data) * 0.70))  # threshold chosen for a 70:30 train/test split
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array(smsdata_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
# start the test slice at trainset_size (the original trainset_size + 1 silently skipped one sample)
x_test = np.array(smsdata_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])
print("x_train:====", x_train)
print("y_train:====", y_train)

# Term-document matrix: bag of words
with open('SMSSpamCollection', 'r', newline='', encoding='mac_roman') as csvfile:
    sms_exp = []
    csv_reader = csv.reader(csvfile, delimiter='\t')
    for line in csv_reader:
        sms_exp.append(preprocessing(line[1]))
vectorizer = CountVectorizer(min_df=1)
X_exp = vectorizer.fit_transform(sms_exp)
print("||".join(vectorizer.get_feature_names_out()))  # get_feature_names() in scikit-learn < 1.0
print('X_exp>>>>>', X_exp.toarray())

# TF-IDF
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
print('X_train:.....>>>>', X_train)
print('X_test:>>>>>>>', X_test)

# Naive Bayes classifier
clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print("y_nb_predicted>>", y_nb_predicted)
print('\n confusion_matrix \n')
cm = metrics.confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(metrics.classification_report(y_test, y_nb_predicted))

# Show the n least/most informative features
# (older scikit-learn exposed these per-feature log probabilities as clf.coef_[0])
feature_names = vectorizer.get_feature_names_out()
coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
n = 15
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

# Decision tree classifier
# clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
# y_tree_predicted = clf.predict(X_test.toarray())
# print(y_tree_predicted)
# print('\n Here is the classification report: y_tree_predicted')
# print(metrics.classification_report(y_test, y_tree_predicted))

# Stochastic gradient descent
clf = SGDClassifier(alpha=0.001, max_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(metrics.classification_report(y_test, y_pred))
print('\n confusion_matrix \n')
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Support vector machine
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(metrics.classification_report(y_test, y_svm_predicted))
print('\n confusion_matrix \n')
cm = metrics.confusion_matrix(y_test, y_svm_predicted)
print(cm)

# Random forest
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)
y_RF_pred = clf.predict(X_test)
print('RF_confusion_matrix:')
print(metrics.confusion_matrix(y_test, y_RF_pred))
print('RF_classification_report:')
print(metrics.classification_report(y_test, y_RF_pred))

# Text clustering with the K-means algorithm
true_k = 5
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                        init_size=1000, batch_size=1000)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)

print("For K-means clustering")
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print(clustering)

print("For K-means mini-batch clustering")
clustering = collections.defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)
print(clustering)

# Topic modeling with gensim
documents = [document for document in smsdata_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
print('lsi.print_topics(20)>>>>>', lsi.print_topics(20))

n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
for i in range(n_topics):
    # show_topic returns (word, probability) pairs in gensim 4.x
    terms = [word for word, prob in lda.show_topic(i, 10)]
    print("Top 10 terms for topic #", str(i), ": ", terms)
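Once the listing above has run, the fitted LDA model can also be applied to unseen text. A small follow-up sketch, assuming the `lda`, `dictionary`, and `preprocessing` objects from the example are still in scope; the sample message is made up for illustration:

# Infer the topic mixture of a new, unseen message (hypothetical example text)
new_sms = "Congratulations! You have won a free prize, call now"
new_bow = dictionary.doc2bow(preprocessing(new_sms).split())  # map tokens onto the existing vocabulary
for topic_id, weight in lda.get_document_topics(new_bow):
    print("topic", topic_id, "weight %.4f" % weight)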