1 Problem Description
Problem: email classification (spam detection)
Task: classify each message into one of two classes (spam or ham)
Dataset: https://www.kaggle.com/uciml/sms-spam-collection-dataset#spam.csv
2 Data Preprocessing
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word
import re
from sklearn.model_selection import train_test_split

# The NLTK corpora used below must be downloaded once beforehand, e.g.:
# import nltk; nltk.download('stopwords'); nltk.download('wordnet')
Read the data
data = pd.read_csv('spam.csv', encoding="ISO-8859-1")
data.columns
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
data.head()
     v1                                                 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0   ham  Go until jurong point, crazy.. Available only ...        NaN        NaN        NaN
1   ham                      Ok lar... Joking wif u oni...        NaN        NaN        NaN
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN        NaN        NaN
3   ham  U dun say so early hor... U c already then say...        NaN        NaN        NaN
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN        NaN        NaN
Drop the unused columns
data = data[['v1', 'v2']]
data.head()
     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Rename the columns
data = data.rename(columns={"v1": "label", "v2": "text"})
data.head()
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
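Before preprocessing, it is worth checking how imbalanced the two classes are; the SMS Spam Collection contains roughly 4825 ham and 747 spam messages. A quick check (the exact counts depend on the downloaded CSV):

data['label'].value_counts()
# ham     4825   (approximate, for this Kaggle CSV)
# spam     747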
Remove punctuation and extra whitespace
data['text'] = data['text'].apply(lambda x: re.sub('[!@#$:).;,?&]', ' ', x.lower()))
data['text'] = data['text'].apply(lambda x: re.sub(' +', ' ', x))  # collapse runs of spaces
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat '
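The regex above only strips a fixed list of punctuation characters. A broader variant (a sketch, not what is used in this article) keeps only lowercase letters, digits and spaces, then squeezes whitespace:

# Hypothetical alternative: drop everything except a-z, 0-9 and spaces, then collapse whitespace.
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-z0-9 ]', ' ', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())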
Convert words to lowercase
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
Remove stopwords
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['text'][0]
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'
Stemming and lemmatization
st = PorterStemmer()
data['text'] = data['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['text'][0]
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
data.head()
  label                                               text
0   ham  go jurong point crazi avail bugi n great world...
1   ham                              ok lar joke wif u oni
2  spam  free entri 2 wkli comp win fa cup final tkt 21...
3   ham                u dun say earli hor u c alreadi say
4   ham               nah think goe usf live around though
3 Feature Extraction
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
Using TensorFlow backend.
Split into training and test sets
train, test = train_test_split(data, test_size=0.2)
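Because spam is the minority class, a stratified split keeps the spam/ham ratio the same in both sets. A possible variant (an assumption, not what is used above):

# Optional: stratify on the label column and fix the random seed for reproducibility.
train, test = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)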
Set the parameters
max_sequence_length = 300
num_words = 20000
embedding_dim = 100
Build the tokenizer
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train.text)
train_sequences = tokenizer.texts_to_sequences(train.text)
test_sequences = tokenizer.texts_to_sequences(test.text)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

train_x = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_x = pad_sequences(test_sequences, maxlen=max_sequence_length)
print(train_x.shape)
print(test_x.shape)
Found 6702 unique tokens.
(4457, 300)
(1115, 300)
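To see what the tokenizer and padding actually do, a single message can be pushed through both steps; the integer IDs depend on the fitted vocabulary, so the exact values will differ:

sample = ["free entri 2 wkli comp win fa cup final"]
seq = tokenizer.texts_to_sequences(sample)               # list of word indices, e.g. [[24, 371, ...]]
padded = pad_sequences(seq, maxlen=max_sequence_length)  # zero-padded on the left to length 300
print(padded.shape)                                      # (1, 300)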
Vectorize the labels
import numpy as np

def label_vectorize(labels):
    label_vec = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        if str(label) == 'ham':
            label_vec[i][0] = 1
        else:
            label_vec[i][1] = 1
    return label_vec

train_y = label_vectorize(train['label'])
test_y = label_vectorize(test['label'])
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

train_labels = train['label']
test_labels = test['label']

le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
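The LabelEncoder + to_categorical route is an alternative to label_vectorize above; since LabelEncoder assigns classes alphabetically ('ham' → 0, 'spam' → 1), both encodings should agree. A minimal sanity check, assuming that mapping:

print(np.allclose(train_y, labels_train))   # expected: True
print(np.allclose(test_y, labels_test))     # expected: True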
4 Build and Train the Model
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    input_length=max_sequence_length))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
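Before training, model.summary() is a quick way to check the layer output shapes. With the parameters above, the 300-step input should shrink to 296 after the first Conv1D (kernel size 5, no padding), to 59 after pooling, then to 55 and 11 after the second conv/pool pair, and flatten to a 1408-dimensional vector:

model.summary()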
model.fit(train_x, train_y,
          batch_size=64,
          epochs=5,
          validation_split=0.2)
Train on 3565 samples, validate on 892 samples
Epoch 1/5
3565/3565 [==============================] - 25s 7ms/step - loss: 0.3923 - acc: 0.8480 - val_loss: 0.1514 - val_acc: 0.9451
Epoch 2/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.1729 - acc: 0.9372 - val_loss: 0.0789 - val_acc: 0.9753
Epoch 3/5
3565/3565 [==============================] - 25s 7ms/step - loss: 0.0940 - acc: 0.9731 - val_loss: 0.2079 - val_acc: 0.9787
Epoch 4/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.0590 - acc: 0.9857 - val_loss: 0.3246 - val_acc: 0.9843
Epoch 5/5
3565/3565 [==============================] - 23s 7ms/step - loss: 0.0493 - acc: 0.9882 - val_loss: 0.3150 - val_acc: 0.9877
<keras.callbacks.History at 0x1cac6187940>
5 Model Evaluation
model.evaluate(test_x, test_y)
1115/1115 [==============================] - 2s 2ms/step
[0.32723046118903054, 0.97847533632287]
predicted = model.predict(test_x)
predicted
array([[0.71038646, 0.28961352],
[0.71285075, 0.28714925],
[0.7101978 , 0.28980213],
...,
[0.7092874 , 0.29071262],
[0.70976096, 0.290239 ],
[0.70463425, 0.29536578]], dtype=float32)
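The model outputs one probability per class. To turn these into labels, take the argmax over the two columns (column 0 is ham and column 1 is spam, matching label_vectorize above); a minimal sketch:

pred_idx = predicted.argmax(axis=1)                   # 0 = ham, 1 = spam
pred_labels = np.where(pred_idx == 0, 'ham', 'spam')
print(pred_labels[:10])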
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(test_y, predicted.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(test_y, predicted.round()))
precision: [0.97961264 0.97014925]
recall: [0.99585492 0.86666667]
fscore: [0.98766701 0.91549296]
support: [965 150]
############################
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       965
          1       0.97      0.87      0.92       150

avg / total       0.98      0.98      0.98      1115
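As a complementary check, a confusion matrix over the argmax class indices shows where the misclassifications fall; a minimal sketch:

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_y.argmax(axis=1), predicted.argmax(axis=1))
print(cm)   # rows: true ham/spam, columns: predicted ham/spam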
Source: https://foochane.cn/article/2019052202.html