This post is based on the exercise set by Professor Xipeng Qiu of the School of Computer Science, Fudan University: https://www.zhihu.com/question/324189960
Task: implement text classification based on logistic/softmax regression
References: Text classification; Neural Networks and Deep Learning (《神经网络与深度学习》), Chapters 2 and 3
Dataset: Classify the sentiment of sentences from the Rotten Tomatoes dataset
Implementation requirement: NumPy
Key concepts to know:
Text feature representation: Bag-of-Words, N-gram
Classifier: logistic/softmax regression; loss functions, (stochastic) gradient descent, feature selection
Dataset: splitting into training/validation/test sets
Experiments: analyze how different features, loss functions, and learning rates affect the final classification performance; shuffle, batch, mini-batch
Note: the code below does not follow the requirements to the letter; it uses the ready-made tools from sklearn instead. A plain-NumPy sketch of the required approach follows this note.
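Since the assignment actually asks for a NumPy implementation, here is a minimal sketch of softmax regression trained with shuffled mini-batch gradient descent on the cross-entropy loss. This is an illustration, not the original post's code: the function names are made up, X is assumed to be a dense (or scipy sparse) feature matrix, and y an integer label array.

import numpy as np

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)   # subtract row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def train_softmax_regression(X, y, n_classes, lr=0.1, epochs=10, batch_size=64, seed=1):
    # Hypothetical helper: learns weights W (n_features x n_classes) and bias b
    # by minimizing the cross-entropy loss with mini-batch gradient descent.
    rng = np.random.default_rng(seed)
    n_samples, n_features = X.shape
    W = np.zeros((n_features, n_classes))
    b = np.zeros(n_classes)
    Y = np.eye(n_classes)[np.asarray(y)]          # one-hot encode the labels
    for _ in range(epochs):
        idx = rng.permutation(n_samples)          # shuffle once per epoch
        for start in range(0, n_samples, batch_size):
            batch = idx[start:start + batch_size]
            P = softmax(X[batch] @ W + b)         # predicted class probabilities
            G = (P - Y[batch]) / len(batch)       # gradient of the loss w.r.t. the logits
            W -= lr * (X[batch].T @ G)
            b -= lr * G.sum(axis=0)
    return W, b

def predict_softmax_regression(X, W, b):
    return np.argmax(X @ W + b, axis=1)

Because scipy sparse matrices support row indexing and @ products with NumPy arrays, this sketch can in principle take the CountVectorizer output below directly; varying lr and batch_size is one way to run the learning-rate and mini-batch experiments listed above.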
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pylab as plt
%matplotlib inline

# Load the data
df_train = pd.read_csv(r'sentiment-analysis-on-movie-reviews/train.tsv', delimiter='\t')
df_test = pd.read_csv(r'sentiment-analysis-on-movie-reviews/test.tsv', delimiter='\t')
df_train.head()

# Preprocess the text
df_train['Phrase'] = df_train['Phrase'].apply(lambda x: x.lower())
# print(df_train['Phrase'])
# The default stop-word list cannot be used here: it would strip tokens such as 'a',
# but some phrases in this dataset consist of nothing but a single 'a'.
# stop_word = set(stopwords.words('english'))  # English stop-word list
# df_train['Phrase'] = df_train['Phrase'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_word]))
df_train['tokenizer_sents'] = df_train['Phrase'].apply(lambda x: nltk.word_tokenize(x))  # tokenize each phrase
# print(df_train['tokenizer_sents'])  # output looks like [a, joke, in, the, united, states]

# Stemming
stemmer = SnowballStemmer('english')
df_train['tokenizer_sents'] = df_train['tokenizer_sents'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x]))
# print(df_train['tokenizer_sents'])  # some stems look poor, e.g. 'forced' becomes 'forc'

# Split and vectorize the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
X = df_train['tokenizer_sents']
y = df_train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
vect = CountVectorizer()
# Other parameters are possible, e.g. CountVectorizer(ngram_range=(1, 1), analyzer='word', min_df=0.001),
# where ngram_range controls the N-gram features
X_train_df = vect.fit_transform(X_train)  # encode the text
X_test_df = vect.transform(X_test)        # note: transform, not fit_transform
print('Number of features:', len(vect.get_feature_names_out()))  # 10730 (get_feature_names() on sklearn < 1.0)

# Build several machine-learning models
from sklearn.linear_model import LogisticRegression
from sklearn import metrics  # moved up: metrics is used by every model below
lr = LogisticRegression()
lr.fit(X_train_df, y_train)
y_pred_class = lr.predict(X_test_df)
print('LR:', metrics.accuracy_score(y_test, y_pred_class))  # LR: 0.6295014737921313

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_df, y_train)
# print(X_train_df)  # a sparse matrix, e.g.
# (0, 635)  1
# (1, 3495) 1
# print(X_test_df)
y_pred_class = nb.predict(X_test_df)
print('NB:', metrics.accuracy_score(y_test, y_pred_class))  # NB: 0.612392669486095

from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train_df, y_train)
y_pred_class = sgd.predict(X_test_df)
print('SGD:', metrics.accuracy_score(y_test, y_pred_class))  # SGD: 0.6094771241830066

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_df, y_train)
y_pred_class = rfc.predict(X_test_df)
print('RF:', metrics.accuracy_score(y_test, y_pred_class))  # RF: 0.6248237857234397

from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_df, y_train)
y_pred_class = xgb.predict(X_test_df)
print('XGB:', metrics.accuracy_score(y_test, y_pred_class))  # XGB: 0.5396642317057542

# Use xgboost's built-in tool to inspect feature importance
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(10, 15))
plot_importance(xgb, height=0.5, max_num_features=64, ax=ax)
plt.show()
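The requirements also ask for an analysis of how different features affect performance. As a sketch building on the variables defined above (min_df=2 and max_iter=1000 are illustrative choices, not from the original run), one could compare unigram features against unigram-plus-bigram features like this:

# Compare unigram features with unigram+bigram features using the same classifier
for ngrams in [(1, 1), (1, 2)]:
    vect = CountVectorizer(ngram_range=ngrams, min_df=2)
    X_train_ng = vect.fit_transform(X_train)
    X_test_ng = vect.transform(X_test)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_ng, y_train)
    acc = metrics.accuracy_score(y_test, clf.predict(X_test_ng))
    print(ngrams, 'features:', X_train_ng.shape[1], 'accuracy:', acc)

Swapping CountVectorizer for TfidfVectorizer, or switching the loss parameter of SGDClassifier between 'log_loss' and 'hinge', would cover the feature and loss-function comparisons in the same way.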