多首先,先来了解一下GPU与CPU的区别,如图 可以看到CPU(Central Processing Unit,中央处理单元),由Control(控制台),ALU(Arithmetic Logic Unit,逻辑计算单元),Cache(高速缓存),而GPU(Graphic Processing Unit,图形处理单元)也是由相同的部件组成,但GPU的计算单元远比CPU多,这就决定了GPU适合大量简单,精度要求低的计算,CPU则适合复杂的,精度要求高的计算。(如果还不了解,可以回学校恶补一下微机原理或计算机组成原理)。 在c编程中,一个工程中可以调用声明的函数;但在cuda编程中不是这样,我们人为规定在CPU为host,即主机,GPU为device,即设备,主机上的函数与设备上的函数是不同的,在原来C串行编程时使用的各种库里的函数可能不能在device上运用(刚开始时被这害的好惨开始),当然想cos,sin等一些基本的计算函数还是可以用的,但精度会比较低。 我们可以在函数声明时,在函数前加前缀来改变函数是在host,还device上执行:
__global__ 执行:device 调用:host __device__ 执行:device 调用:device __host__ 执行:host 调用:host还有一个重要的就是线程,块的概念,先在这边简单地提一下,具体会在下次博客中具体讲,简单的线程就是GPU的计算单元同时对不同对象进行相同的操作,即可以并行操作。 接下来,介绍一下第一个程序: 先建一个工程: 选择CUDA 7.5 Runtime 进去之后会直接看到一个程序:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> //包含库 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size); __global__ void addKernel(int *c, const int *a, const int *b)//host调用的在device上执行的函数 { int i = threadIdx.x; c[i] = a[i] + b[i]; } int main() { const int arraySize = 5; const int a[arraySize] = { 1, 2, 3, 4, 5 }; const int b[arraySize] = { 10, 20, 30, 40, 50 }; int c[arraySize] = { 0 }; //声明函数 // Add vectors in parallel. cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize); if (cudaStatus != cudaSuccess) { fprintf(stderr, "addWithCuda failed!"); return 1; }//保证调用成功 printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]); // cudaDeviceReset must be called before exiting in order for profiling and // tracing tools such as Nsight and Visual Profiler to show complete traces. cudaStatus = cudaDeviceReset(); //初始化设备 if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceReset failed!"); return 1; } return 0; } // Helper function for using CUDA to add vectors in parallel. cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size) { int *dev_a = 0; int *dev_b = 0; int *dev_c = 0; cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system. cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } // Allocate GPU buffers for three vectors (two input, one output) . cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); //开辟出空间类似malloc if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } // Copy input vectors from host memory to GPU buffers. cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); //将host的数据传到device变量上 if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } // Launch a kernel on the GPU with one thread for each element. addKernel<<<1, size>>>(dev_c, dev_a, dev_b); // Check for any errors launching the kernel cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; } // cudaDeviceSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); goto Error; } // Copy output vector from GPU buffer to host memory. cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } Error: cudaFree(dev_c); cudaFree(dev_a); cudaFree(dev_b); //释放cudamalloc的内存 return cudaStatus; }这段代码实现了[1,2,3,4,5]与[10,20,30,40,50]两个矩阵的并行相加,即同时对进行5次 c[i] = a[i] + b[i];而不是串行时的一个for循环。 cuda程序: (1)host数据传到device上:cudaMemcpy (2)device处理:global 和device函数 (3)device数据传回host上: cudaMemcpy,别忘了free cudamalloc的空间 具体API可以看英伟达提供的API手册: https://cudazone.nvidia.cn/nvidia-gpu-computing-documentation/
`
