pytorch 学习笔记5 —— 实例学习

xiaoxiao2022-07-14 161

首先是一个两层网络，使用numpy版本和pytorch版本

其实如果仔细对比，差别不大。代码量也相似。因为这里没有使用pytorch的自动梯度计算。

numpy版本

# -*- coding: utf-8 -*- import numpy as np # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random input and output data x = np.random.randn(N, D_in) y = np.random.randn(N, D_out) # Randomly initialize weights w1 = np.random.randn(D_in, H) w2 = np.random.randn(H, D_out) learning_rate = 1e-6 for t in range(500): # Forward pass: compute predicted y h = x.dot(w1) h_relu = np.maximum(h, 0) y_pred = h_relu.dot(w2) # Compute and print loss loss = np.square(y_pred - y).sum() print(t, loss) # Backprop to compute gradients of w1 and w2 with respect to loss grad_y_pred = 2.0 * (y_pred - y) grad_w2 = h_relu.T.dot(grad_y_pred) grad_h_relu = grad_y_pred.dot(w2.T) grad_h = grad_h_relu.copy() grad_h[h < 0] = 0 grad_w1 = x.T.dot(grad_h) # Update weights w1 -= learning_rate * grad_w1 w2 -= learning_rate * grad_w2

pytorch版本

# -*- coding: utf-8 -*- import torch dtype = torch.float device = torch.device("cpu") # device = torch.device("cuda:0") # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random input and output data x = torch.randn(N, D_in, device=device, dtype=dtype) y = torch.randn(N, D_out, device=device, dtype=dtype) # Randomly initialize weights w1 = torch.randn(D_in, H, device=device, dtype=dtype) w2 = torch.randn(H, D_out, device=device, dtype=dtype) learning_rate = 1e-6 for t in range(500): # Forward pass: compute predicted y h = x.mm(w1) h_relu = h.clamp(min=0) y_pred = h_relu.mm(w2) # Compute and print loss loss = (y_pred - y).pow(2).sum().item() print(t, loss) # Backprop to compute gradients of w1 and w2 with respect to loss grad_y_pred = 2.0 * (y_pred - y) grad_w2 = h_relu.t().mm(grad_y_pred) grad_h_relu = grad_y_pred.mm(w2.t()) grad_h = grad_h_relu.clone() grad_h[h < 0] = 0 grad_w1 = x.t().mm(grad_h) # Update weights using gradient descent w1 -= learning_rate * grad_w1 w2 -= learning_rate * grad_w2 使用autograd 神经网络的计算会转化成为计算图。图中的节点会变成张量tensor，边是从输出节点到下一个节点的函数。 use automatic differentiation to compute gradients When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. # -*- coding: utf-8 -*- import torch dtype = torch.float device = torch.device("cpu") # device = torch.device("cuda:0") # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold input and outputs. # Setting requires_grad=False indicates that we do not need to compute gradients # with respect to these Tensors during the backward pass. x = torch.randn(N, D_in, device=device, dtype=dtype) y = torch.randn(N, D_out, device=device, dtype=dtype) # Create random Tensors for weights. # Setting requires_grad=True indicates that we want to compute gradients with # respect to these Tensors during the backward pass. w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) learning_rate = 1e-6 for t in range(500): # Forward pass: compute predicted y using operations on Tensors; these # are exactly the same operations we used to compute the forward pass using # Tensors, but we do not need to keep references to intermediate values since # we are not implementing the backward pass by hand. y_pred = x.mm(w1).clamp(min=0).mm(w2) # Compute and print loss using operations on Tensors. # Now loss is a Tensor of shape (1,) # loss.item() gets the a scalar value held in the loss. loss = (y_pred - y).pow(2).sum() print(t, loss.item()) # Use autograd to compute the backward pass. This call will compute the # gradient of loss with respect to all Tensors with requires_grad=True. # After this call w1.grad and w2.grad will be Tensors holding the gradient # of the loss with respect to w1 and w2 respectively. loss.backward() # Manually update weights using gradient descent. Wrap in torch.no_grad() # because weights have requires_grad=True, but we don't need to track this # in autograd. # An alternative way is to operate on weight.data and weight.grad.data. # Recall that tensor.data gives a tensor that shares the storage with # tensor, but doesn't track history. # You can also use torch.optim.SGD to achieve this. with torch.no_grad(): w1 -= learning_rate * w1.grad w2 -= learning_rate * w2.grad # Manually zero the gradients after updating weights w1.grad.zero_() w2.grad.zero_()

自定义autograd函数

需要定义torch.autograd.Function的一个子类，并且实现forward和backward函数。

# -*- coding: utf-8 -*- import torch class MyReLU(torch.autograd.Function): """ We can implement our own custom autograd Functions by subclassing torch.autograd.Function and implementing the forward and backward passes which operate on Tensors. """ @staticmethod def forward(ctx, input): """ In the forward pass we receive a Tensor containing the input and return a Tensor containing the output. ctx is a context object that can be used to stash information for backward computation. You can cache arbitrary objects for use in the backward pass using the ctx.save_for_backward method. """ ctx.save_for_backward(input) return input.clamp(min=0) @staticmethod def backward(ctx, grad_output): """ In the backward pass we receive a Tensor containing the gradient of the loss with respect to the output, and we need to compute the gradient of the loss with respect to the input. """ input, = ctx.saved_tensors grad_input = grad_output.clone() grad_input[input < 0] = 0 return grad_input dtype = torch.float device = torch.device("cpu") # device = torch.device("cuda:0") # Uncomment this to run on GPU # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold input and outputs. x = torch.randn(N, D_in, device=device, dtype=dtype) y = torch.randn(N, D_out, device=device, dtype=dtype) # Create random Tensors for weights. w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) learning_rate = 1e-6 for t in range(500): # To apply our Function, we use Function.apply method. We alias this as 'relu'. relu = MyReLU.apply # Forward pass: compute predicted y using operations; we compute # ReLU using our custom autograd operation. y_pred = relu(x.mm(w1)).mm(w2) # Compute and print loss loss = (y_pred - y).pow(2).sum() print(t, loss.item()) # Use autograd to compute the backward pass. loss.backward() # Update weights using gradient descent with torch.no_grad(): w1 -= learning_rate * w1.grad w2 -= learning_rate * w2.grad # Manually zero the gradients after updating weights w1.grad.zero_() w2.grad.zero_()

关于计算图

其实tensorflow和pytorch很相似，都是使用图的方式来跟踪计算。区别是：tensorflow使用静态图，而pytorch使用动态图。

静态图的好处是：你可以提前优化图的结构，这对于多次重复计算很有好处。

动态图的好处是：你可以使用命令对每个数据点执行不同的计算。

TensorFlow版本

# -*- coding: utf-8 -*- import tensorflow as tf import numpy as np # First we set up the computational graph: # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create placeholders for the input and target data; these will be filled # with real data when we execute the graph. x = tf.placeholder(tf.float32, shape=(None, D_in)) y = tf.placeholder(tf.float32, shape=(None, D_out)) # Create Variables for the weights and initialize them with random data. # A TensorFlow Variable persists its value across executions of the graph. w1 = tf.Variable(tf.random_normal((D_in, H))) w2 = tf.Variable(tf.random_normal((H, D_out))) # Forward pass: Compute the predicted y using operations on TensorFlow Tensors. # Note that this code does not actually perform any numeric operations; it # merely sets up the computational graph that we will later execute. h = tf.matmul(x, w1) h_relu = tf.maximum(h, tf.zeros(1)) y_pred = tf.matmul(h_relu, w2) # Compute loss using operations on TensorFlow Tensors loss = tf.reduce_sum((y - y_pred) ** 2.0) # Compute gradient of the loss with respect to w1 and w2. grad_w1, grad_w2 = tf.gradients(loss, [w1, w2]) # Update the weights using gradient descent. To actually update the weights # we need to evaluate new_w1 and new_w2 when executing the graph. Note that # in TensorFlow the the act of updating the value of the weights is part of # the computational graph; in PyTorch this happens outside the computational # graph. learning_rate = 1e-6 new_w1 = w1.assign(w1 - learning_rate * grad_w1) new_w2 = w2.assign(w2 - learning_rate * grad_w2) # Now we have built our computational graph, so we enter a TensorFlow session to # actually execute the graph. with tf.Session() as sess: # Run the graph once to initialize the Variables w1 and w2. sess.run(tf.global_variables_initializer()) # Create numpy arrays holding the actual data for the inputs x and targets # y x_value = np.random.randn(N, D_in) y_value = np.random.randn(N, D_out) for _ in range(500): # Execute the graph many times. Each time it executes we want to bind # x_value to x and y_value to y, specified with the feed_dict argument. # Each time we execute the graph we want to compute the values for loss, # new_w1, and new_w2; the values of these Tensors are returned as numpy # arrays. loss_value, _, _ = sess.run([loss, new_w1, new_w2], feed_dict={x: x_value, y: y_value}) print(loss_value)

关于nn模块

nn包定义了一组模块，它们大致相当于神经网络层。

模块接收输入张量并计算输出张量，但也可以保持内部状态，例如包含可学习参数的张量。

nn包还定义了一组在训练神经网络时常用的有用损失函数。

nn版本

# -*- coding: utf-8 -*- import torch # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold inputs and outputs x = torch.randn(N, D_in) y = torch.randn(N, D_out) # Use the nn package to define our model as a sequence of layers. nn.Sequential # is a Module which contains other Modules, and applies them in sequence to # produce its output. Each Linear Module computes output from input using a # linear function, and holds internal Tensors for its weight and bias. model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out), ) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. loss_fn = torch.nn.MSELoss(reduction='sum') learning_rate = 1e-4 for t in range(500): # Forward pass: compute predicted y by passing x to the model. Module objects # override the __call__ operator so you can call them like functions. When # doing so you pass a Tensor of input data to the Module and it produces # a Tensor of output data. y_pred = model(x) # Compute and print loss. We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the # loss. loss = loss_fn(y_pred, y) print(t, loss.item()) # Zero the gradients before running the backward pass. model.zero_grad() # Backward pass: compute gradient of the loss with respect to all the learnable # parameters of the model. Internally, the parameters of each Module are stored # in Tensors with requires_grad=True, so this call will compute gradients for # all learnable parameters in the model. loss.backward() # Update the weights using gradient descent. Each parameter is a Tensor, so # we can access its gradients like we did before. with torch.no_grad(): for param in model.parameters(): param -= learning_rate * param.grad

nn中的optim

PyTorch中的optim包提取了优化算法的思想，并提供了常用优化算法的实现。

使用nn和nn.optim

# -*- coding: utf-8 -*- import torch # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold inputs and outputs x = torch.randn(N, D_in) y = torch.randn(N, D_out) # Use the nn package to define our model and loss function. model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out), ) loss_fn = torch.nn.MSELoss(reduction='sum') # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algoriths. The first argument to the Adam constructor tells the # optimizer which Tensors it should update. learning_rate = 1e-4 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) for t in range(500): # Forward pass: compute predicted y by passing x to the model. y_pred = model(x) # Compute and print loss. loss = loss_fn(y_pred, y) print(t, loss.item()) # Before the backward pass, use the optimizer object to zero all of the # gradients for the variables it will update (which are the learnable # weights of the model). This is because by default, gradients are # accumulated in buffers( i.e, not overwritten) whenever .backward() # is called. Checkout docs of torch.autograd.backward for more details. optimizer.zero_grad() # Backward pass: compute gradient of the loss with respect to model # parameters loss.backward() # Calling the step function on an Optimizer makes an update to its # parameters optimizer.step()

自定义模块

可以自定义自己所想要的网络。

自定义模块版本

（注意区别前面的定义方式）

# -*- coding: utf-8 -*- import torch class TwoLayerNet(torch.nn.Module): def __init__(self, D_in, H, D_out): """ In the constructor we instantiate two nn.Linear modules and assign them as member variables. """ super(TwoLayerNet, self).__init__() self.linear1 = torch.nn.Linear(D_in, H) self.linear2 = torch.nn.Linear(H, D_out) def forward(self, x): """ In the forward function we accept a Tensor of input data and we must return a Tensor of output data. We can use Modules defined in the constructor as well as arbitrary operators on Tensors. """ h_relu = self.linear1(x).clamp(min=0) y_pred = self.linear2(h_relu) return y_pred # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. N, D_in, H, D_out = 64, 1000, 100, 10 # Create random Tensors to hold inputs and outputs x = torch.randn(N, D_in) y = torch.randn(N, D_out) # Construct our model by instantiating the class defined above model = TwoLayerNet(D_in, H, D_out) # 之前的定义方式 # model = torch.nn.Sequential( # torch.nn.Linear(D_in, H), # torch.nn.ReLU(), # torch.nn.Linear(H, D_out), # ) # Construct our loss function and an Optimizer. The call to model.parameters() # in the SGD constructor will contain the learnable parameters of the two # nn.Linear modules which are members of the model. criterion = torch.nn.MSELoss(reduction='sum') optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) for t in range(500): # Forward pass: Compute predicted y by passing x to the model y_pred = model(x) # Compute and print loss loss = criterion(y_pred, y) print(t, loss.item()) # Zero gradients, perform a backward pass, and update the weights. optimizer.zero_grad() loss.backward() optimizer.step()

最新回复(0)