CUDA入门（二）cuda编程的基本知识与第一个cuda程序

xiaoxiao2026-05-15 17

首先，先来了解一下GPU与CPU的区别。如图可以看到，CPU（Central Processing Unit，中央处理单元）由Control（控制单元）、ALU（Arithmetic Logic Unit，算术逻辑单元）和Cache（高速缓存）组成，而GPU（Graphics Processing Unit，图形处理单元）也是由相同的部件组成，但GPU的计算单元远比CPU多，这就决定了GPU适合大量简单、精度要求低的计算，CPU则适合复杂的、精度要求高的计算。（如果还不了解，可以回学校恶补一下微机原理或计算机组成原理）。在C编程中，一个工程中可以调用声明的函数；但在cuda编程中不是这样，我们人为规定CPU为host，即主机，GPU为device，即设备。主机上的函数与设备上的函数是不同的，原来C串行编程时使用的各种库里的函数可能不能在device上运用（刚开始时被这害得好惨），当然像cos、sin等一些基本的计算函数还是可以用的，但精度会比较低。我们可以在函数声明时，在函数前加前缀来指定函数是在host上还是device上执行：

__global__ 执行：device，调用：host；__device__ 执行：device，调用：device；__host__ 执行：host，调用：host

还有一个重要的就是线程、块的概念，先在这边简单地提一下，具体会在下次博客中讲。简单地说，线程就是GPU的计算单元同时对不同对象进行相同的操作，即可以并行操作。接下来，介绍一下第一个程序。先建一个工程：选择CUDA 7.5 Runtime，进去之后会直接看到一个程序：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add on the device: c[i] = a[i] + b[i] for every i < size.
//
// Declared __global__, so it executes on the device and is launched from the
// host. Expects a 1-D grid of 1-D blocks. The grid may contain more threads
// than there are elements; the bounds check makes the surplus threads do
// nothing, so the launch configuration does not have to divide `size` evenly.
__global__ void addKernel(int *c, const int *a, const int *b, unsigned int size)
{
    // Flat global index, widened before the multiply so it cannot wrap.
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        c[i] = a[i] + b[i];
    }
}

int main()
{
    // Host-side input and output arrays.
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // Note that this tears the device state down; it does not initialize it.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output array with room for `size` ints
//   a, b - host input arrays of `size` ints each
//   size - number of elements; 0 is accepted and is a no-op
//
// Returns cudaSuccess on success, otherwise the status of the first CUDA call
// that failed (a diagnostic is also written to stderr). Device buffers are
// released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // All locals are initialized here, ahead of the first `goto Error`:
    // C++ forbids a jump that bypasses a variable's initialization.
    const size_t bytes = (size_t)size * sizeof(int);
    const unsigned int threadsPerBlock = 256;
    // Ceil-division written so it cannot overflow for sizes near UINT_MAX.
    const unsigned int blocks =
        size / threadsPerBlock + (size % threadsPerBlock != 0 ? 1u : 0u);

    // Nothing to add; a launch with zero blocks would be rejected as an
    // invalid configuration, so return before touching the device.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    // cudaMalloc is the device-side counterpart of malloc.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    // Launch enough blocks to give every element its own thread. A single
    // block of `size` threads would be rejected once `size` exceeds the
    // per-block thread limit of the device.
    addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    // Release the cudaMalloc'd buffers. Pointers that were never allocated
    // are still 0, and cudaFree(0) performs no operation.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

这段代码实现了[1,2,3,4,5]与[10,20,30,40,50]两个向量的并行相加，即同时进行5次 c[i] = a[i] + b[i]，而不是串行时的一个for循环。 cuda程序: （1）host数据传到device上：cudaMemcpy （2）device处理：__global__ 和 __device__ 函数 （3）device数据传回host上：cudaMemcpy，别忘了free cudaMalloc的空间。具体API可以看英伟达提供的API手册： https://cudazone.nvidia.cn/nvidia-gpu-computing-documentation/

最新回复(0)