[Task 5 (2 days)] Implementing L1/L2 Regularization and Dropout in PyTorch
Contents: understand the principle of Dropout; implement regularization in code (L1, L2, Dropout); a numpy implementation of Dropout; implementing Dropout in PyTorch.
Understanding the Principle of Dropout
Dropout is a technique for preventing overfitting (overfitting means the model achieves a small loss and high accuracy on the training data but a large loss and low accuracy on the test data). When a neural network is trained on relatively few samples, Dropout is one way to keep the model from overfitting.
Concretely, on every training pass Dropout deactivates hidden neurons at random, each with probability p. The effect is comparable to training several different networks on the same training set and averaging them as an ensemble. Dropout also breaks co-adaptations between particular neurons, which makes the model more robust.
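As a minimal illustration of this behavior (assuming a recent PyTorch version), torch.nn.Dropout zeroes each input element with probability p while the module is in training mode and rescales the survivors by 1/(1-p); in eval mode it simply passes its input through:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)     # each element is zeroed with probability 0.5
x = torch.ones(8)

drop.train()                 # training mode: random mask plus 1/(1-p) rescaling
print(drop(x))               # e.g. tensor([2., 0., 2., 2., 0., 0., 2., 2.])

drop.eval()                  # evaluation mode: dropout is disabled
print(drop(x))               # tensor([1., 1., 1., 1., 1., 1., 1., 1.])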
Implementing Regularization in Code (L1, L2, Dropout)
L1 Norm
The L1 norm is the sum of the absolute values of the elements of the parameter matrix W. Compared with the L0 norm, whose minimization is an NP-hard problem, the L1 norm is the tightest convex relaxation of L0 and is much easier to optimize. L1 regularization is commonly known as LASSO.
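Written as a formula (with w_i ranging over all entries of W and \lambda the regularization coefficient, 0.001 in the snippet below), the L1-regularized objective is

\mathrm{loss} = \mathrm{loss}_{\text{data}} + \lambda \lVert W \rVert_1, \qquad \lVert W \rVert_1 = \sum_i \lvert w_i \rvert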
# L1 regularization: add the sum of absolute parameter values to the classification loss.
# model, criterion, optimizer, x_train, y_train and EPOCHS are assumed to be defined elsewhere.
for epoch in range(EPOCHS):
    y_pred = model(x_train)
    classify_loss = criterion(y_pred, y_train.float().view(-1, 1))

    # Recompute the L1 penalty every step so it reflects the current parameters
    regularization_loss = 0
    for param in model.parameters():
        regularization_loss += torch.sum(torch.abs(param))

    loss = classify_loss + 0.001 * regularization_loss  # 0.001 is the regularization strength

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
L2 Norm
The L2 penalty is the sum of the squares of the elements of the parameter matrix W. Unlike the previous two norms, it does not force parameters to become exactly 0; instead it pushes most of them close to 0. L1 promotes sparsity and therefore discards some features entirely (their weights become 0), whereas L2 only shrinks the weights toward 0 and keeps all features. L2 regularization is known as Ridge.
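The corresponding objective, in the same notation as for L1, is

\mathrm{loss} = \mathrm{loss}_{\text{data}} + \lambda \lVert W \rVert_2^2, \qquad \lVert W \rVert_2^2 = \sum_i w_i^2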
# weight_decay applies an L2 (Ridge) penalty to all parameters during the update
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9, weight_decay=0.001)
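The same effect can also be written out by hand. A minimal sketch, reusing the model/criterion/x_train/y_train names from the L1 snippet above (weight_decay folds the penalty into the parameter update, so the two match only up to the usual factor-of-two convention):

l2_lambda = 0.001                                # corresponds to weight_decay=0.001 above
y_pred = model(x_train)
classify_loss = criterion(y_pred, y_train.float().view(-1, 1))
l2_loss = sum(torch.sum(param ** 2) for param in model.parameters())
loss = classify_loss + l2_lambda * l2_loss       # add the L2 penalty explicitly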
Numpy Implementation of Dropout
import numpy as np

# Toy dataset: 4 samples, 3 input features, binary targets
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
y = np.array([[0, 1, 1, 0]]).T

alpha, hidden_dim, dropout_percent, do_dropout = (0.5, 4, 0.2, True)

# Weight matrices initialized uniformly in [-1, 1)
synapse_0 = 2 * np.random.random((3, hidden_dim)) - 1
synapse_1 = 2 * np.random.random((hidden_dim, 1)) - 1

for j in range(60000):
    # Forward pass: sigmoid hidden layer
    layer_1 = 1 / (1 + np.exp(-np.dot(X, synapse_0)))
    if do_dropout:
        # Keep each hidden unit with probability 1 - dropout_percent and
        # scale the survivors by 1 / (1 - dropout_percent)
        layer_1 *= np.random.binomial([np.ones((len(X), hidden_dim))], 1 - dropout_percent)[0] * (1.0 / (1 - dropout_percent))
    layer_2 = 1 / (1 + np.exp(-np.dot(layer_1, synapse_1)))

    # Backpropagation through the sigmoid activations
    layer_2_delta = (layer_2 - y) * (layer_2 * (1 - layer_2))
    layer_1_delta = layer_2_delta.dot(synapse_1.T) * (layer_1 * (1 - layer_1))

    # Gradient descent updates
    synapse_1 -= alpha * layer_1.T.dot(layer_2_delta)
    synapse_0 -= alpha * X.T.dot(layer_1_delta)
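Note the factor 1/(1 - dropout_percent) applied to the mask: this is the "inverted dropout" convention. Scaling the surviving activations up during training keeps their expected value unchanged, so nothing needs to be rescaled at test time; the forward pass simply skips the mask when do_dropout is False.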
Implementing Dropout in PyTorch
import torch
import matplotlib.pyplot as plt

N_SAMPLES = 20
N_HIDDEN = 300

# Training data: a noisy linear relationship
x = torch.unsqueeze(torch.linspace(-1, 1, N_SAMPLES), 1)
y = x + 0.3 * torch.normal(torch.zeros(N_SAMPLES, 1), torch.ones(N_SAMPLES, 1))

# Test data drawn from the same distribution
test_x = torch.unsqueeze(torch.linspace(-1, 1, N_SAMPLES), 1)
test_y = test_x + 0.3 * torch.normal(torch.zeros(N_SAMPLES, 1), torch.ones(N_SAMPLES, 1))

'''
plt.scatter(x.numpy(), y.numpy(), c='magenta', s=50, alpha=0.5, label='train')
plt.scatter(test_x.numpy(), test_y.numpy(), c='cyan', s=50, alpha=0.5, label='test')
plt.legend(loc='upper left')
plt.ylim((-2.5, 2.5))
plt.show()
'''

# A deliberately over-parameterized network without dropout
net_overfitting = torch.nn.Sequential(
    torch.nn.Linear(1, N_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN, N_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN, 1),
)

# The same architecture with Dropout(p=0.5) after each hidden layer
net_dropped = torch.nn.Sequential(
    torch.nn.Linear(1, N_HIDDEN),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN, N_HIDDEN),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(N_HIDDEN, 1),
)
print(net_overfitting)
print(net_dropped)

optimizer_ofit = torch.optim.Adam(net_overfitting.parameters(), lr=0.01)
optimizer_drop = torch.optim.Adam(net_dropped.parameters(), lr=0.01)
loss_func = torch.nn.MSELoss()

plt.ion()
for t in range(500):
    pred_ofit = net_overfitting(x)
    pred_drop = net_dropped(x)
    loss_ofit = loss_func(pred_ofit, y)
    loss_drop = loss_func(pred_drop, y)

    optimizer_ofit.zero_grad()
    optimizer_drop.zero_grad()
    loss_ofit.backward()
    loss_drop.backward()
    optimizer_ofit.step()
    optimizer_drop.step()

    if t % 10 == 0:
        # Switch to eval mode so dropout is disabled while evaluating
        net_overfitting.eval()
        net_dropped.eval()

        plt.cla()
        with torch.no_grad():
            test_pred_ofit = net_overfitting(test_x)
            test_pred_drop = net_dropped(test_x)
        plt.scatter(x.numpy(), y.numpy(), c='magenta', s=50, alpha=0.3, label='train')
        plt.scatter(test_x.numpy(), test_y.numpy(), c='cyan', s=50, alpha=0.3, label='test')
        plt.plot(test_x.numpy(), test_pred_ofit.numpy(), 'r-', lw=3, label='overfitting')
        plt.plot(test_x.numpy(), test_pred_drop.numpy(), 'b--', lw=3, label='dropout(50%)')
        plt.text(0, -1.2, 'overfitting loss=%.4f' % loss_func(test_pred_ofit, test_y).item(),
                 fontdict={'size': 20, 'color': 'red'})
        plt.text(0, -1.5, 'dropout loss=%.4f' % loss_func(test_pred_drop, test_y).item(),
                 fontdict={'size': 20, 'color': 'blue'})
        plt.legend(loc='upper left')
        plt.ylim((-2.5, 2.5))
        plt.pause(0.1)

        # Back to train mode so dropout is active for the next updates
        net_overfitting.train()
        net_dropped.train()

plt.ioff()
plt.show()
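Note the eval()/train() calls around the evaluation step: nn.Dropout only drops units while the module is in training mode, so both networks are switched to eval mode before computing test predictions and switched back to train mode before the next parameter update.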