文章目录
读取数据 / 把每个词转换为词向量 / 搭建原始RNN(单层RNN、多层RNN)/ 训练模型(获取预测结果、定义随机选择的函数作为训练集输入、训练函数、训练过程、绘制损失函数)/
使用混淆矩阵来评估模型 / 预测输入的名字来自于哪种语言,并给出最高的三种可能
读取数据
glob 用于文件名模式匹配:它直接返回所有符合通配符模式的路径,不需要手动遍历整个目录再逐个判断文件名是否符合。
from __future__
import unicode_literals
, print_function
, division
from io
import open
import glob
import os
def findFiles(path):
    """Return every path matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` returns matches in an arbitrary, filesystem-dependent order.
    The order of these files determines the order of ``all_categories`` and
    therefore the class index of each language, so the result is sorted to
    make the indices reproducible across runs and platforms.

    Args:
        path: A glob pattern such as ``'data/names/*.txt'``.

    Returns:
        A sorted list of matching path strings (empty if nothing matches).
    """
    return sorted(glob.glob(path))
# Show which per-language name files were discovered.
print(findFiles(path='data/names/*.txt'))
import unicodedata
import string
# Alphabet known to the model: ASCII letters plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the characters contained in ``all_letters``.

    The input is NFD-normalised so that accented characters are split into a
    base character followed by combining marks. Combining marks (Unicode
    category ``Mn``) and every character that is not part of ``all_letters``
    are then dropped.

    Args:
        s: The string to convert.

    Returns:
        The filtered string.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent: discard, the base letter is kept
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)
# Containers filled while the data files are loaded:
#   all_categories   - language names, in load order
#   category_lines   - language -> all names
#   training_lines   - language -> names used for training
#   validation_lines - language -> names held out for validation
all_categories = list()
category_lines = dict()
training_lines = dict()
validation_lines = dict()
def readLines(filename):
    """Read a UTF-8 text file of names and return them converted to ASCII.

    The file is opened in a ``with`` block so the handle is closed
    deterministically. Blank lines are skipped so that an empty file or a
    stray empty line does not produce an empty name.

    Args:
        filename: Path of the file to read; one name per line.

    Returns:
        A list with ``unicodeToAscii`` applied to every non-blank line.
    """
    with open(filename, encoding='utf-8') as handle:
        raw_lines = handle.read().strip().split('\n')
    return [unicodeToAscii(line) for line in raw_lines if line.strip()]
# Load every language file. The file name without its extension is the
# category name, and each file is split 80% / 20% into training and validation.
for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines
    # NOTE(review): the split is positional (first 80% / last 20%). If a file
    # is sorted, the validation names all come from the tail of that ordering;
    # consider shuffling before splitting.
    num_of_training_set = int(len(lines) * 0.8)
    training_lines[category] = lines[:num_of_training_set]
    validation_lines[category] = lines[num_of_training_set:]

# Number of output classes. Later code (model construction, confusion
# matrices) reads `n_categories`, which was never assigned before.
n_categories = len(all_categories)

print(all_categories)
['data/names\\Arabic.txt', 'data/names\\Chinese.txt', 'data/names\\Czech.txt', 'data/names\\Dutch.txt', 'data/names\\English.txt', 'data/names\\French.txt', 'data/names\\German.txt', 'data/names\\Greek.txt', 'data/names\\Irish.txt', 'data/names\\Italian.txt', 'data/names\\Japanese.txt', 'data/names\\Korean.txt', 'data/names\\Polish.txt', 'data/names\\Portuguese.txt', 'data/names\\Russian.txt', 'data/names\\Scottish.txt', 'data/names\\Spanish.txt', 'data/names\\Vietnamese.txt']
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']
查看意大利语对应的前五个单词
# Peek at the first five names loaded for Italian.
print(category_lines['Italian'][0:5])
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']
把每个词转换为词向量
all_letters
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"
import torch
def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``.

    Uses ``str.find``, so a character that is not in ``all_letters`` yields
    ``-1`` rather than raising.
    """
    index = all_letters.find(letter)
    return index
# `is_cuda` is read as a module-level flag by the tensor helpers and the model
# set-up code, but no assignment is visible in this file; derive it from the
# actual availability of CUDA.
is_cuda = torch.cuda.is_available()


def letterToTensor(letter):
    """One-hot encode a single letter.

    Args:
        letter: The character to encode.

    Returns:
        A ``(1, n_letters)`` float tensor with a 1 at the letter's index,
        moved to the GPU when ``is_cuda`` is true. A character that is not in
        ``all_letters`` produces an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex reports unknown characters as -1; using -1 as an index
    # would silently mark the *last* letter of the alphabet instead.
    if index >= 0:
        tensor[0][index] = 1
    if is_cuda:
        tensor = tensor.cuda()
    return tensor
def lineToTensor(line):
    """One-hot encode a whole name, one time step per character.

    Args:
        line: The string to encode.

    Returns:
        A ``(len(line), 1, n_letters)`` float tensor, moved to the GPU when
        ``is_cuda`` is true. Characters that are not in ``all_letters`` are
        encoded as all-zero rows.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # -1 means "not in the alphabet"; indexing with it would wrongly set
        # the last letter, so such characters are left as zero vectors.
        if index >= 0:
            tensor[li][0][index] = 1
    if is_cuda:
        tensor = tensor.cuda()
    return tensor
# Sanity check of the encoders: one letter, then the shape of a full name.
print(letterToTensor('J'))
print(lineToTensor('Jones').shape)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0.]], device='cuda:0')
torch.Size([5, 1, 57])
搭建原始RNN
单层RNN
import torch
.nn
as nn
class BaseRNN(nn.Module):
    """Single-layer recurrent network built from three linear layers.

    Per time step:
        hidden = tanh(h2h(previous_hidden) + i2h(letter))
        output = h2o(hidden)
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Size of one input vector (one time step).
            hidden_size: Size of the hidden state.
            output_size: Size of the per-step output.
        """
        super(BaseRNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()
        self.h2o = nn.Linear(hidden_size, output_size)

    def step(self, letter, hidden):
        """Advance the network by one time step.

        Args:
            letter: Input vector for this step.
            hidden: Hidden state from the previous step.

        Returns:
            ``(output, hidden)`` for this step.
        """
        i2h = self.i2h(letter)
        h2h = self.h2h(hidden)
        hidden = self.activation(h2h + i2h)
        output = self.h2o(hidden)
        return output, hidden

    def forward(self, word):
        """Run the whole sequence and return the output of the last step.

        Args:
            word: Tensor whose first dimension indexes the time steps.

        Returns:
            The output produced by the final time step.

        Raises:
            ValueError: If ``word`` contains no time steps.
        """
        if word.size()[0] == 0:
            raise ValueError("BaseRNN.forward() needs at least one time step")
        # Create the initial state on the same kind of device as the input;
        # unconditionally using the CUDA default failed on CPU-only machines.
        hidden = self.initHidden(is_cuda=word.is_cuda)
        for letter in word:
            output, hidden = self.step(letter, hidden)
        return output

    def initHidden(self, is_cuda=True):
        """Return a zero hidden state of shape ``(1, hidden_size)``.

        Args:
            is_cuda: When true the tensor is moved to the GPU.
        """
        hidden = torch.zeros(1, self.hidden_size)
        return hidden.cuda() if is_cuda else hidden
# Build the single-layer model: one input per letter, one output per language.
n_hidden = 128
rnn = BaseRNN(n_letters, n_hidden, n_categories)
# Move the model to the GPU only when the global flag asks for it.
rnn = rnn.cuda() if is_cuda else rnn
多层RNN
class DeeperRNN(nn.Module):
    """Two stacked ``BaseRNN`` layers.

    The hidden state of ``layer1`` is the input of ``layer2``; the network
    output is the output of ``layer2``. The output projection of ``layer1``
    is computed by ``layer1.step`` but its result is discarded.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create both layers; they share the same hidden size.

        Args:
            input_size: Size of one input vector (one time step).
            hidden_size: Hidden state size used by both layers.
            output_size: Size of the per-step output.
        """
        super(DeeperRNN, self).__init__()
        self.hidden1_size = hidden_size
        self.hidden2_size = hidden_size
        self.layer1 = BaseRNN(input_size, hidden_size, output_size)
        self.layer2 = BaseRNN(hidden_size, hidden_size, output_size)

    def step(self, letter, hidden1, hidden2):
        """Advance both layers by one time step.

        Returns:
            ``(output2, hidden1, hidden2)`` where ``output2`` is the output
            of the second layer.
        """
        _, hidden1 = self.layer1.step(letter, hidden1)
        output2, hidden2 = self.layer2.step(hidden1, hidden2)
        return output2, hidden1, hidden2

    def forward(self, word):
        """Run the whole sequence and return the output of the last step.

        Args:
            word: Tensor whose first dimension indexes the time steps.

        Raises:
            ValueError: If ``word`` contains no time steps.
        """
        if word.size()[0] == 0:
            raise ValueError("DeeperRNN.forward() needs at least one time step")
        # Match the initial states to the input's device instead of always
        # requesting CUDA, which failed on CPU-only machines.
        hidden1, hidden2 = self.initHidden(is_cuda=word.is_cuda)
        for letter in word:
            output, hidden1, hidden2 = self.step(letter, hidden1, hidden2)
        return output

    def initHidden(self, is_cuda=True):
        """Return zero hidden states ``(hidden1, hidden2)``, each ``(1, size)``.

        Args:
            is_cuda: When true both tensors are moved to the GPU.
        """
        hidden1 = torch.zeros(1, self.hidden1_size)
        hidden2 = torch.zeros(1, self.hidden2_size)
        if is_cuda:
            return hidden1.cuda(), hidden2.cuda()
        return hidden1, hidden2
# Replace the model with the two-layer variant (same sizes as before).
n_hidden = 128
rnn = DeeperRNN(n_letters, n_hidden, n_categories)
if is_cuda:
    rnn = rnn.cuda()
训练模型
获取预测结果
def categoryFromOutput(output):
    """Translate a network output into its highest-scoring category.

    Args:
        output: Score tensor produced by the model for one name.

    Returns:
        ``(category_name, category_index)`` of the top-1 entry found by
        ``output.topk(1)``.
    """
    _, best_indices = output.topk(1)
    category_i = best_indices[0].item()
    best_category = all_categories[category_i]
    return best_category, category_i
# `output` is not assigned anywhere before this point in the file, so run the
# current model once to obtain one. The sample name is arbitrary.
output = rnn(lineToTensor('Albert'))
print(categoryFromOutput(output))
('Arabic', 2)
定义随机选择的函数作为训练集输入
import random
def randomChoice(l):
    """Return one uniformly chosen element of the sequence ``l``.

    Raises:
        ValueError: If ``l`` is empty (raised by the ``random`` module).
    """
    position = random.randrange(len(l))
    return l[position]
def randomTrainingExample():
    """Draw a random (language, name) pair from the training split.

    Returns:
        ``(category, line, category_tensor, line_tensor)``: the language
        name, the name string, a one-element long tensor holding the
        language index, and the one-hot encoded name. The label tensor is
        moved to the GPU when ``is_cuda`` is true.
    """
    category = randomChoice(all_categories)
    line = randomChoice(training_lines[category])
    label_index = all_categories.index(category)
    category_tensor = torch.tensor([label_index], dtype=torch.long)
    if is_cuda:
        category_tensor = category_tensor.cuda()
    return category, line, category_tensor, lineToTensor(line)
def randomValidationExample():
    """Draw a random (language, name) pair from the validation split.

    Returns:
        ``(category, line, category_tensor, line_tensor)``: the language
        name, the name string, a one-element long tensor holding the
        language index, and the one-hot encoded name. The label tensor is
        moved to the GPU when ``is_cuda`` is true.
    """
    category = randomChoice(all_categories)
    line = randomChoice(validation_lines[category])
    label_index = all_categories.index(category)
    category_tensor = torch.tensor([label_index], dtype=torch.long)
    if is_cuda:
        category_tensor = category_tensor.cuda()
    return category, line, category_tensor, lineToTensor(line)
# Print ten random training samples to eyeball the sampler.
for _ in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)
category = Dutch / line = Achthoven
category = Chinese / line = Chew
category = English / line = Lucas
category = Chinese / line = Chieu
category = Korean / line = Chou
category = Chinese / line = Feng
category = Greek / line = Kefalas
category = Dutch / line = Hautem
category = Vietnamese / line = Do
category = Chinese / line = Zhu
训练函数
# Loss applied to the model output and the target class index.
criterion = nn.CrossEntropyLoss()
# Step size of the manual gradient-descent update below.
learning_rate = 0.005


def train(category_tensor, line_tensor):
    """Perform one gradient-descent step on a single example.

    Args:
        category_tensor: Tensor holding the target class index.
        line_tensor: Encoded name that is fed to the global ``rnn``.

    Returns:
        ``(output, loss_value)``: the model output for the example and the
        loss as a Python float.
    """
    output = rnn(line_tensor)
    rnn.zero_grad()
    loss = criterion(output, category_tensor)
    loss.backward()

    # Manual SGD: p <- p - learning_rate * grad. Parameters whose gradient is
    # None after backward() are skipped. The keyword form of add_ replaces the
    # deprecated positional add_(scalar, tensor) overload.
    for p in rnn.parameters():
        if p.grad is not None:
            p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()
训练过程
import time
import math
# Training schedule.
n_iters = 100_000      # total number of single-example updates
print_every = 5_000    # progress line interval
plot_every = 1_000     # loss averaging window for the plot

# Running loss of the current plot window and the per-window averages.
current_loss = 0
all_losses = list()
def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: A timestamp previously obtained from ``time.time()``.

    Returns:
        The elapsed wall-clock time, with whole minutes and the remaining
        seconds truncated to integers by the ``%d`` format.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    return '%dm %ds' % (minutes, elapsed - minutes * 60)
start = time.time()

# Main training loop: one random training example per iteration.
# The counter is named `iteration` so it does not shadow the builtin `iter`.
for iteration in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Periodic progress line: iteration, percent done, elapsed time, loss,
    # the sampled name, the model's guess and whether it was right.
    if iteration % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (
            iteration, iteration / n_iters * 100, timeSince(start),
            loss, line, guess, correct))

    # Record the average loss of the last `plot_every` iterations.
    if iteration % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
5000 5% (0m 18s) 2.7723 Kolen / Korean ✗ (Dutch)
10000 10% (0m 37s) 3.8367 Dubois / Greek ✗ (French)
15000 15% (0m 56s) 1.2542 Rypka / Czech ✓
20000 20% (1m 15s) 0.8456 Bang / Korean ✓
25000 25% (1m 32s) 1.6992 Albuquerque / French ✗ (Portuguese)
30000 30% (1m 51s) 0.2303 Sedmikova / Czech ✓
35000 35% (2m 9s) 0.5837 Freudenberger / German ✓
40000 40% (2m 27s) 0.0136 Ableuhov / Russian ✓
45000 45% (2m 46s) 3.5326 Martin / French ✗ (German)
50000 50% (3m 4s) 0.3752 Adamczyk / Polish ✓
55000 55% (3m 22s) 1.3756 Ngo / Korean ✗ (Vietnamese)
60000 60% (3m 40s) 0.5298 Cruz / Portuguese ✓
65000 65% (3m 58s) 0.5359 Sobol / Polish ✓
70000 70% (4m 16s) 0.4308 Cheung / Chinese ✓
75000 75% (4m 35s) 1.0134 Oldland / English ✓
80000 80% (4m 53s) 0.0083 Koustoubos / Greek ✓
85000 85% (5m 12s) 0.9083 Albero / Spanish ✗ (Italian)
90000 90% (5m 27s) 0.0357 Daher / Arabic ✓
95000 95% (5m 42s) 0.0983 Cousineau / French ✓
100000 100% (5m 57s) 0.2204 Araujo / Portuguese ✓
绘制损失函数
import matplotlib
.pyplot
as plt
# Plot the averaged training loss, one point per recorded window.
loss_positions = list(range(len(all_losses)))
plt.plot(loss_positions, all_losses)
plt.xlabel('iterator')
plt.ylabel('loss values')
plt.show()
使用混淆矩阵来评估模型
混淆矩阵是数据科学、数据分析和机器学习中总结分类模型预测结果的情形分析表,以矩阵形式将数据集中的记录按照真实的类别与分类模型作出的分类判断两个标准进行汇总。在下面的代码中,行对应真实类别(答案),列对应模型预测的结果,对角线上的元素表示预测正确的样本。
# Square count matrices (one row and one column per category) for the
# training and validation splits, plus the number of samples drawn for each.
confusion_training = torch.zeros((n_categories, n_categories))
confusion_validation = torch.zeros((n_categories, n_categories))
n_confusion = 5000
def evaluate(line_tensor):
    """Return the output of the global ``rnn`` for one encoded name.

    The model is switched to evaluation mode and the forward pass runs under
    ``torch.no_grad()`` so that inference does not record an autograd graph.

    Args:
        line_tensor: Encoded name to classify.

    Returns:
        The model output (not tracking gradients).
    """
    rnn.eval()
    with torch.no_grad():
        return rnn(line_tensor)
def _fill_confusion(confusion, sample_example, n_samples):
    """Add ``n_samples`` predictions to ``confusion`` (row = true, column = guess)."""
    for _ in range(n_samples):
        category, line, category_tensor, line_tensor = sample_example()
        output = evaluate(line_tensor)
        guess, guess_i = categoryFromOutput(output)
        category_i = all_categories.index(category)
        confusion[category_i][guess_i] += 1


def _normalize_rows(confusion):
    """Divide each row by its sum in place; rows without samples stay zero."""
    for i in range(confusion.size(0)):
        row_total = confusion[i].sum()
        # Guard against 0/0, which would fill the row with NaN.
        if row_total > 0:
            confusion[i] = confusion[i] / row_total


# Count predictions on both splits (training first, then validation).
_fill_confusion(confusion_training, randomTrainingExample, n_confusion)
_fill_confusion(confusion_validation, randomValidationExample, n_confusion)

# Accuracy = correctly classified samples (the diagonal) / samples drawn.
# This must be computed before the rows are normalised.
right_train = confusion_training.diag().sum()
right_valid = confusion_validation.diag().sum()
acc_train = right_train / n_confusion
acc_valid = right_valid / n_confusion

_normalize_rows(confusion_training)
_normalize_rows(confusion_validation)

# Show both matrices side by side.
fig = plt.figure()
ax1 = fig.add_subplot(121)
cax1 = ax1.matshow(confusion_training.numpy())
ax2 = fig.add_subplot(122)
cax2 = ax2.matshow(confusion_validation.numpy())

# Pin one tick per category before assigning labels; setting labels on the
# default ticks attaches the names to the wrong rows and columns.
tick_positions = list(range(n_categories))
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(all_categories, rotation=90)
ax1.set_yticks(tick_positions)
ax1.set_yticklabels(all_categories)
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(all_categories, rotation=90)

plt.show()

print("Traing set Acc is", acc_train.item())
print("validation set Acc is", acc_valid.item())
Traing set Acc is 0.8082000017166138
validation set Acc is 0.46380001306533813
预测输入的名字来自于哪种语言,并给出最高的三种可能
torch.topk(input, k, dim=None, largest=True, sorted=True, out=None) -> (Tensor, LongTensor)
def predict(input_line, n_predictions=3):
    """Print and return the most probable languages for a name.

    The name is passed through ``unicodeToAscii`` before encoding, because
    the data files are loaded through the same function and characters
    outside ``all_letters`` cannot be encoded.

    Args:
        input_line: The name to classify.
        n_predictions: How many top categories to report; capped at the
            number of model outputs.

    Returns:
        A list of ``[probability, category_name]`` pairs, most probable first.

    Raises:
        ValueError: If no encodable character remains after normalisation.
    """
    print('\n> %s' % input_line)
    ascii_line = unicodeToAscii(input_line)
    if not ascii_line:
        raise ValueError(
            "input_line contains no character from the model alphabet")

    with torch.no_grad():
        output = evaluate(lineToTensor(ascii_line))
        output = torch.nn.functional.softmax(output, dim=1)

    # topk raises if asked for more entries than there are classes.
    n_predictions = min(n_predictions, output.size(1))
    topv, topi = output.topk(n_predictions, 1, True)

    predictions = []
    for value, category_index in zip(topv[0].tolist(), topi[0].tolist()):
        print('Probability (%.2f) %s' % (value, all_categories[category_index]))
        predictions.append([value, all_categories[category_index]])
    # The list was previously built and then discarded.
    return predictions
# Try the classifier on a few hand-picked names, in this order.
for sample_name in ('Dovesky', 'Jackson', 'Satoshi', "Cui", "Zhuang", "Xue", "Wang"):
    predict(sample_name)
> Dovesky
Probability (0.68) Czech
Probability (0.21) Russian
Probability (0.09) English
> Jackson
Probability (0.54) English
Probability (0.28) Scottish
Probability (0.06) Russian
> Satoshi
Probability (0.98) Japanese
Probability (0.01) Polish
Probability (0.00) Czech
> Cui
Probability (0.69) Chinese
Probability (0.14) Korean
Probability (0.08) Vietnamese
> Zhuang
Probability (0.54) Scottish
Probability (0.31) Chinese
Probability (0.04) Polish
> Xue
Probability (0.96) Chinese
Probability (0.02) Vietnamese
Probability (0.01) French
> Wang
Probability (0.99) Chinese
Probability (0.01) German
Probability (0.01) Scottish