Learning GPU acceleration: GPU matrix computation written with reference to examples and online blog posts.
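All of the snippets below share one preamble. The original post never shows its includes or the value of BLOCK_DIM, so the following is an assumed setup: 16 is a commonly used tile width, and the CUDA_CHECK macro is my own convenience helper, not part of the referenced examples.

#include <cuda_runtime.h>
#include <cstdio>

// Tile width shared by all kernels below. 16 is an assumed value;
// the original never defines BLOCK_DIM.
#define BLOCK_DIM 16

// Convenience error-checking wrapper (an addition, not from the original).
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
        }                                                             \
    } while (0)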
// Matrix transpose
__global__ void transposeGPU(double *outdata, double *indata, int width, int height)
{
    // The +1 column of padding avoids shared-memory bank conflicts
    // when the tile is read back column-wise.
    __shared__ double block[BLOCK_DIM][BLOCK_DIM + 1];

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height)) {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = indata[index_in];
    }

    __syncthreads();

    // Swap the block coordinates so each tile is written back transposed.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width)) {
        unsigned int index_out = yIndex * height + xIndex;
        outdata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
void transpose(double *outdata, double *indata, int width, int height)
{
    double *d_a, *d_b;
    cudaMalloc((void**)&d_a, sizeof(double) * width * height);
    cudaMalloc((void**)&d_b, sizeof(double) * height * width);

    cudaMemcpy(d_a, indata, sizeof(double) * width * height, cudaMemcpyHostToDevice);

    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    int b_x = (width + BLOCK_DIM - 1) / BLOCK_DIM;   // ceil(width / BLOCK_DIM)
    int b_y = (height + BLOCK_DIM - 1) / BLOCK_DIM;  // ceil(height / BLOCK_DIM)
    dim3 Blocks(b_x, b_y);
    transposeGPU<<<Blocks, Threads>>>(d_b, d_a, width, height);

    cudaMemcpy(outdata, d_b, sizeof(double) * width * height, cudaMemcpyDeviceToHost);

    // Release the device buffers.
    cudaFree(d_a);
    cudaFree(d_b);
}
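A minimal host-side sanity check for the wrapper above, assuming the preamble from the top of the post; testTranspose is a hypothetical helper name and the 2x3 input is arbitrary:

void testTranspose()
{
    const int width = 3, height = 2;             // input is 2 rows x 3 cols, row-major
    double in[height * width]  = {1, 2, 3,
                                  4, 5, 6};
    double out[width * height] = {0};

    transpose(out, in, width, height);

    // Expected output: the 3x2 matrix {1,4; 2,5; 3,6}.
    for (int r = 0; r < width; ++r) {
        for (int c = 0; c < height; ++c)
            printf("%g ", out[r * height + c]);
        printf("\n");
    }
}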
// Matrix multiplication: tiled kernel in the style of the CUDA SDK matrixMul
// sample. There are no boundary guards, so every dimension must be a
// multiple of BLOCK_SIZE.
template<int BLOCK_SIZE>
__global__ void MatrixMulGPU(double *c, const double *a, const double *b,
                             unsigned int WA, unsigned int WB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = WA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + WA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * WB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    double Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ double As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ double Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[k + WB * ty + tx] = Csub;
}
void MultiMatrix(double **A, double **B, double **R,
                 int Arow, int Acol, int Brow, int Bcol)
{
    // Computes R = A * B. Requires Acol == Brow and, because the kernel
    // above has no boundary guards, dimensions that are multiples of BLOCK_DIM.
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cudaMalloc((void**)&dev_a, Arow * Acol * sizeof(double));
    cudaMalloc((void**)&dev_b, Brow * Bcol * sizeof(double));
    cudaMalloc((void**)&dev_c, Arow * Bcol * sizeof(double));

    cudaMemcpy(dev_a, *A, Arow * Acol * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, *B, Brow * Bcol * sizeof(double), cudaMemcpyHostToDevice);

    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    int b_x = (Bcol + BLOCK_DIM - 1) / BLOCK_DIM;
    int b_y = (Arow + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 Blocks(b_x, b_y);
    MatrixMulGPU<BLOCK_DIM><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, Acol, Bcol);

    cudaMemcpy(*R, dev_c, Arow * Bcol * sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
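A quick way to sanity-check MultiMatrix is to compare one element against a naive CPU dot product. The size is a multiple of BLOCK_DIM as the kernel requires; testMultiMatrix and its fill values are illustrative:

void testMultiMatrix()
{
    const int n = BLOCK_DIM;                 // smallest size the kernel accepts
    static double A[n * n], B[n * n], R[n * n];
    for (int i = 0; i < n * n; ++i) { A[i] = i % 7; B[i] = i % 5; }

    // MultiMatrix takes double** handles to the row-major buffers.
    double *pA = A, *pB = B, *pR = R;
    MultiMatrix(&pA, &pB, &pR, n, n, n, n);

    // CPU reference for element (0,0).
    double ref = 0;
    for (int k = 0; k < n; ++k)
        ref += A[0 * n + k] * B[k * n + 0];
    printf("GPU R[0][0] = %g, CPU ref = %g\n", R[0], ref);
}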
// Matrix inversion: Gauss-Jordan elimination in shared memory.
// The whole matrix must fit in one block (matrix_size <= BLOCK_DIM), and
// there is no pivoting, so a zero pivot on the diagonal produces Inf/NaN.
__global__ void InvertAGPU(double *A, double *R, int matrix_size)
{
    __shared__ double MXI[BLOCK_DIM][BLOCK_DIM];

    int isx = threadIdx.x;
    int isy = threadIdx.y;
    double tmpIn;
    double tmpInv;

    // Initialize MXI to the identity matrix E.
    if (isx == isy)
        MXI[isy][isx] = 1;
    else
        MXI[isy][isx] = 0;

    for (int i = 0; i < matrix_size; i++) {
        if (i == isy && isx < matrix_size && isy < matrix_size) {
            // Scale the pivot row so the diagonal element (pivot) becomes 1.
            tmpIn = A[i * matrix_size + i];
            A[i * matrix_size + isx] /= tmpIn;
            MXI[i][isx] /= tmpIn;
        }
        __syncthreads();
        if (i != isy && isx < matrix_size && isy < matrix_size) {
            // Zero the pivot column in every other row, applying the same
            // row operation to MXI.
            tmpInv = A[isy * matrix_size + i];
            A[isy * matrix_size + isx] -= tmpInv * A[i * matrix_size + isx];
            MXI[isy][isx] -= tmpInv * MXI[i][isx];
        }
        __syncthreads();
    }

    // Guard the final write as well, so out-of-range threads do not
    // write past the end of R.
    if (isx < matrix_size && isy < matrix_size)
        R[isy * matrix_size + isx] = MXI[isy][isx];
}
void InvertA(double *A, double *R, int matrix_size)
{
    double *Agpu;
    double *Rgpu;

    cudaMalloc((void**)&Agpu, matrix_size * matrix_size * sizeof(double));
    cudaMalloc((void**)&Rgpu, matrix_size * matrix_size * sizeof(double));

    cudaMemcpy(Agpu, A, matrix_size * matrix_size * sizeof(double), cudaMemcpyHostToDevice);

    // The kernel relies on __syncthreads(), which only synchronizes threads
    // within a single block, so launch exactly one block; this limits
    // matrix_size to at most BLOCK_DIM.
    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    dim3 Blocks(1, 1);
    InvertAGPU<<<Blocks, Threads>>>(Agpu, Rgpu, matrix_size);

    cudaMemcpy(R, Rgpu, matrix_size * matrix_size * sizeof(double), cudaMemcpyDeviceToHost);

    cudaFree(Agpu);
    cudaFree(Rgpu);
}
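Because InvertA leaves the host copy of A untouched, the simplest check is to multiply A by the returned inverse on the CPU and look for the identity. testInvertA is a hypothetical helper; the 3x3 matrix is an arbitrary invertible example whose pivots stay nonzero without row swaps:

void testInvertA()
{
    const int n = 3;                          // must satisfy n <= BLOCK_DIM
    double A[n * n]    = {4, 7, 2,
                          3, 6, 1,
                          2, 5, 3};
    double Ainv[n * n] = {0};

    InvertA(A, Ainv, n);

    // A * Ainv should be close to the identity matrix.
    for (int r = 0; r < n; ++r) {
        for (int c = 0; c < n; ++c) {
            double s = 0;
            for (int k = 0; k < n; ++k)
                s += A[r * n + k] * Ainv[k * n + c];
            printf("%6.3f ", s);
        }
        printf("\n");
    }
}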
// Matrix addition
__global__ void addGPU(double *a, double *b, double *c, int width, int height)
{
    // x indexes columns (bounded by width) and y indexes rows (bounded by
    // height), matching the grid computed in the host wrapper below.
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < height && col < width) {
        c[row * width + col] = a[row * width + col] + b[row * width + col];
    }
}
void add(double *a, double *b, double *result, int width, int height)
{
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cudaMalloc((void**)&dev_a, width * height * sizeof(double));
    cudaMalloc((void**)&dev_b, width * height * sizeof(double));
    cudaMalloc((void**)&dev_c, width * height * sizeof(double));

    cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    int b_x = (width + BLOCK_DIM - 1) / BLOCK_DIM;
    int b_y = (height + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 Blocks(b_x, b_y);
    addGPU<<<Blocks, Threads>>>(dev_a, dev_b, dev_c, width, height);

    cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

    // Release the device buffers.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
// Matrix subtraction
__global__ void subGPU(double *a, double *b, double *c, int width, int height)
{
    // Same indexing convention as addGPU: x for columns, y for rows.
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < height && col < width) {
        c[row * width + col] = a[row * width + col] - b[row * width + col];
    }
}
void sub(double *a, double *b, double *result, int width, int height)
{
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cudaMalloc((void**)&dev_a, width * height * sizeof(double));
    cudaMalloc((void**)&dev_b, width * height * sizeof(double));
    cudaMalloc((void**)&dev_c, width * height * sizeof(double));

    cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    int b_x = (width + BLOCK_DIM - 1) / BLOCK_DIM;
    int b_y = (height + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 Blocks(b_x, b_y);
    subGPU<<<Blocks, Threads>>>(dev_a, dev_b, dev_c, width, height);

    cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

    // Release the device buffers.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
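Finally, a small driver that exercises the element-wise wrappers, assuming the preamble above; the sizes and fill values are arbitrary, and the trailing cudaGetLastError surfaces any deferred kernel failure:

int main()
{
    const int w = 4, h = 4;
    double a[w * h], b[w * h], sum[w * h], diff[w * h];
    for (int i = 0; i < w * h; ++i) { a[i] = i; b[i] = 2 * i; }

    add(a, b, sum, w, h);
    sub(a, b, diff, w, h);

    printf("sum[5] = %g (expect %g), diff[5] = %g (expect %g)\n",
           sum[5], a[5] + b[5], diff[5], a[5] - b[5]);

    CUDA_CHECK(cudaGetLastError());
    return 0;
}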