GPU矩阵计算

    xiaoxiao · 2022-06-30 · 161

    学习GPU加速 参考例子和网上博文写的GPU矩阵计算

    // Matrix transpose kernel (CUDA SDK "transpose" sample style).
    // Reads `indata` as a height x width row-major matrix and writes its
    // width x height transpose into `outdata`. Expects a BLOCK_DIM x BLOCK_DIM
    // thread block and grid (ceil(width/BLOCK_DIM), ceil(height/BLOCK_DIM)).
    __global__ void transposeGPU(double *outdata, double *indata, int width, int height)
    {
        // +1 column of padding avoids shared-memory bank conflicts on the
        // transposed (column-wise) reads below.
        __shared__ double block[BLOCK_DIM][BLOCK_DIM + 1];
        unsigned int xIndex = blockIdx.x*BLOCK_DIM + threadIdx.x;
        unsigned int yIndex = blockIdx.y*BLOCK_DIM + threadIdx.y;

        // Stage one tile of the input into shared memory (coalesced read).
        if ((xIndex < width) && (yIndex < height))
        {
            unsigned int index_in = yIndex*width + xIndex;
            block[threadIdx.y][threadIdx.x] = indata[index_in];
        }

        // Every thread must finish writing its tile element before any
        // thread reads the tile back transposed.
        __syncthreads();

        // Re-derive the indices with the block coordinates swapped so the
        // store to the output is also coalesced.
        xIndex = blockIdx.y*BLOCK_DIM + threadIdx.x;
        yIndex = blockIdx.x*BLOCK_DIM + threadIdx.y;

        // Output has `height` columns; guard handles partial edge tiles.
        if ((xIndex < height) && (yIndex < width))
        {
            unsigned int index_out = yIndex*height + xIndex;
            outdata[index_out] = block[threadIdx.x][threadIdx.y];
        }

    }

    // Matrix transpose host wrapper: copies `indata` (height x width,
    // row-major) to the device, launches transposeGPU, and copies the
    // width x height transpose back into `outdata`.
    // NOTE(review): no CUDA error checking anywhere in this file — a failed
    // cudaMalloc silently propagates garbage; consider a CUDA_CHECK macro.
    void transpose(double *outdata, double *indata, int width, int height)
    {
        double *d_a, *d_b;
        cudaMalloc((double**)&d_a, sizeof(double) * width * height);
        cudaMalloc((double**)&d_b, sizeof(double) * height * width);

        cudaMemcpy((void*)d_a, (void*)indata, sizeof(double) * width * height, cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);

        transposeGPU << <Blocks, Threads >> > (d_b, d_a, width, height);
        // The blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy((void*)outdata, (void*)d_b, sizeof(double) * width * height, cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing either buffer, leaking
        // 2 * width * height doubles of device memory on every call.
        cudaFree(d_a);
        cudaFree(d_b);
    }
    // Matrix multiplication

    // Tiled matrix-multiplication kernel (CUDA SDK "matrixMul" sample style):
    // C = A * B, all matrices row-major. WA = columns of A, WB = columns of B.
    // Launch with BLOCK_SIZE x BLOCK_SIZE threads and a grid covering one
    // block per BLOCK_SIZE x BLOCK_SIZE tile of C.
    // NOTE(review): there are NO bounds guards on the loads/stores below, so
    // WA, WB and the height of A are presumably required to be exact
    // multiples of BLOCK_SIZE — confirm at call sites (MultiMatrix builds the
    // grid with ceil-division, which over-runs for non-multiple sizes).
    template<int BLOCK_SIZE> __global__ void MatrixMulGPU(double *c, const double *a, const double *b, unsigned int WA, unsigned int WB)
    {
        // Block index
        int bx = blockIdx.x;
        int by = blockIdx.y;

        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;

        // Index of the first sub-matrix of A processed by the block
        int aBegin = WA * BLOCK_SIZE * by;

        // Index of the last sub-matrix of A processed by the block
        int aEnd = aBegin + WA - 1;

        // Step size used to iterate through the sub-matrices of A
        int aStep = BLOCK_SIZE;

        // Index of the first sub-matrix of B processed by the block
        int bBegin = BLOCK_SIZE * bx;

        // Step size used to iterate through the sub-matrices of B
        int bStep = BLOCK_SIZE * WB;

        // Csub accumulates the single element of C computed by this thread.
        double Csub = 0;

        // March one tile-row of A against one tile-column of B.
        for (int i = aBegin, j = bBegin;
            i <= aEnd;
            i += aStep, j += bStep)
        {
            // Shared-memory tile of A for this step.
            __shared__ double As[BLOCK_SIZE][BLOCK_SIZE];

            // Shared-memory tile of B for this step.
            __shared__ double Bs[BLOCK_SIZE][BLOCK_SIZE];

            // Each thread loads one element of each tile (coalesced).
            As[ty][tx] = a[i + WA * ty + tx];
            Bs[ty][tx] = b[j + WB * ty + tx];

            // Both tiles must be fully loaded before they are consumed.
            __syncthreads();

            // Multiply the two tiles; each thread accumulates one element.
#pragma unroll

            for (int k = 0; k < BLOCK_SIZE; ++k)
            {
                Csub += As[ty][k] * Bs[k][tx];
            }

            // Tiles may not be overwritten until every thread is done
            // reading them in this iteration.
            __syncthreads();
        }

        // Write the block sub-matrix to device memory;
        // each thread writes one element.
        int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
        c[k + WB * ty + tx] = Csub;
    }

    // Computes R = A * B on the GPU. A is Arow x Acol, B is Brow x Bcol and
    // R is Arow x Bcol; *A, *B and *R must each point at contiguous
    // row-major storage (only the first row pointer is used).
    // NOTE(review): the tiled kernel has no bounds guards, so the dimensions
    // are presumably expected to be exact multiples of BLOCK_DIM — confirm,
    // because the ceil-divided grid below over-runs otherwise. Brow is only
    // used for the B allocation and is assumed equal to Acol — TODO confirm.
    void  MultiMatrix(double **A, double **B, double **R, int Arow, int Acol, int Brow, int Bcol)
    {
        const size_t bytesA = Arow * Acol * sizeof(double);
        const size_t bytesB = Brow * Bcol * sizeof(double);
        const size_t bytesC = Arow * Bcol * sizeof(double);

        double *dA = 0;
        double *dB = 0;
        double *dC = 0;
        cudaMalloc((void**)&dA, bytesA);
        cudaMalloc((void**)&dB, bytesB);
        cudaMalloc((void**)&dC, bytesC);

        cudaMemcpy(dA, *A, bytesA, cudaMemcpyHostToDevice);
        cudaMemcpy(dB, *B, bytesB, cudaMemcpyHostToDevice);

        // One thread per element of R, tiled BLOCK_DIM x BLOCK_DIM.
        dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
        dim3 blocksPerGrid(ceil((float)Bcol / (float)BLOCK_DIM),
                           ceil((float)Arow / (float)BLOCK_DIM));
        MatrixMulGPU<BLOCK_DIM> << <blocksPerGrid, threadsPerBlock >> > (dC, dA, dB, Acol, Bcol);

        // Blocking copy-back also synchronizes with the kernel launch.
        cudaMemcpy(*R, dC, bytesC, cudaMemcpyDeviceToHost);

        cudaFree(dA);
        cudaFree(dB);
        cudaFree(dC);
    }

    // Matrix inversion via Gauss-Jordan elimination.
    // Inverts the matrix_size x matrix_size row-major matrix A into R, using
    // an identity matrix staged in shared memory (MXI) as the augmented half.
    // Limitations (unchanged from the original design):
    //  - single-tile algorithm: requires matrix_size <= BLOCK_DIM and one
    //    BLOCK_DIM x BLOCK_DIM block; additional blocks would redo the same
    //    elimination and race on A;
    //  - no pivoting: divides by A[i][i], so a zero pivot yields inf/nan.
    // A is modified in place (ends as the identity for invertible input).
    __global__ void InvertAGPU(double *A, double *R, int matrix_size)
    {
        __shared__ double MXI[BLOCK_DIM][BLOCK_DIM];

        int isx = threadIdx.x;
        int isy = threadIdx.y;
        double tmpIn;
        double tmpInv;

        // Initialize MXI to the identity matrix.
        if (isx == isy)
            MXI[isy][isx] = 1;
        else
            MXI[isy][isx] = 0;

        for (int i = 0; i < matrix_size; i++)
        {
            if (i == isy && isx < matrix_size && isy < matrix_size)
            {
                // Scale pivot row i so the diagonal element becomes 1,
                // applying the same operation to the augmented half.
                tmpIn = A[i*matrix_size + i];
                A[i*matrix_size + isx] /= tmpIn;
                MXI[i][isx] /= tmpIn;
            }
            __syncthreads();
            if (i != isy && isx < matrix_size && isy < matrix_size)
            {
                // Eliminate column i from every other row, mirrored on MXI.
                tmpInv = A[isy*matrix_size + i];
                A[isy*matrix_size + isx] -= tmpInv * A[i*matrix_size + isx];
                MXI[isy][isx] -= tmpInv * MXI[i][isx];
            }
            // Barriers are outside the divergent branches: all threads reach them.
            __syncthreads();
        }

        // FIX: the original stored unconditionally, so whenever
        // matrix_size < BLOCK_DIM the out-of-range threads wrote shared-memory
        // garbage past the end of the matrix_size^2 buffer R.
        if (isx < matrix_size && isy < matrix_size)
            R[isx*matrix_size + isy] = MXI[isx][isy];
    }

    // Inverts the matrix_size x matrix_size row-major matrix A into R on the
    // GPU via InvertAGPU. A itself is not modified on the host (only the
    // device copy is eliminated in place).
    // NOTE(review): InvertAGPU is a single-tile shared-memory algorithm, so
    // this presumably only produces correct results for
    // matrix_size <= BLOCK_DIM — the multi-block grid computed below would
    // make every block repeat (and race on) the same elimination. Confirm
    // before using on larger matrices.
    void InvertA(double *A, double *R, int matrix_size)
    {
        const size_t bytes = matrix_size * matrix_size * sizeof(double);

        double *Agpu = 0;
        double *Rgpu = 0;
        cudaMalloc((void**)&Agpu, bytes);
        cudaMalloc((void**)&Rgpu, bytes);

        cudaMemcpy(Agpu, A, bytes, cudaMemcpyHostToDevice);

        dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
        dim3 blocksPerGrid(ceil((float)matrix_size / (float)BLOCK_DIM),
                           ceil((float)matrix_size / (float)BLOCK_DIM));
        InvertAGPU << <blocksPerGrid, threadsPerBlock >> > (Agpu, Rgpu, matrix_size);

        // Blocking copy-back also synchronizes with the kernel launch.
        cudaMemcpy(R, Rgpu, bytes, cudaMemcpyDeviceToHost);

        cudaFree(Agpu);
        cudaFree(Rgpu);
    }

    // Matrix addition
    // Element-wise addition c = a + b for row-major width x height matrices.
    // FIX: the original mapped blockIdx.x to the ROW index while the host
    // wrapper sizes the grid's x-dimension from `width` (columns); for any
    // non-square matrix, rows or columns beyond the mismatched grid extent
    // were silently never computed. x now indexes columns and y indexes rows,
    // matching the host grid — this also makes the global accesses coalesced
    // (adjacent threadIdx.x touches adjacent addresses).
    __global__ void addGPU(double* a, double* b, double *c, int width, int height)
    {
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        int row = threadIdx.y + blockIdx.y * blockDim.y;

        if (row < height && col < width)
        {
            c[row*width + col] = a[row*width + col] + b[row*width + col];
        }
    }

    // Element-wise matrix addition host wrapper: result = a + b, all three
    // treated as row-major width x height arrays. Copies the inputs to the
    // device, launches addGPU, and copies the sum back.
    // NOTE(review): no CUDA error checking — failed allocations propagate
    // silently; consider a CUDA_CHECK macro.
    void  add(double* a, double* b, double *result, int width, int height)
    {
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
        cudaMalloc((void**)&dev_a, width * height * sizeof(double));
        cudaMalloc((void**)&dev_b, width * height * sizeof(double));
        cudaMalloc((void**)&dev_c, width * height * sizeof(double));

        cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);
        addGPU << <Blocks, Threads >> > (dev_a, dev_b, dev_c, width, height);

        // Blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing any of the three device
        // buffers, leaking memory on every call.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    }

    // Matrix subtraction
    // Element-wise subtraction c = a - b for row-major width x height matrices.
    // FIX: same grid/index mismatch as the original addGPU — blockIdx.x was
    // used for the ROW index while the host grid sizes its x-dimension from
    // `width` (columns), leaving part of any non-square matrix uncomputed.
    // x now indexes columns and y indexes rows, matching the host grid and
    // giving coalesced global accesses.
    __global__ void subGPU(double* a, double* b, double *c, int width, int height)
    {
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        int row = threadIdx.y + blockIdx.y * blockDim.y;

        if (row < height && col < width)
        {
            c[row*width + col] = a[row*width + col] - b[row*width + col];
        }
    }

    // Element-wise matrix subtraction host wrapper: result = a - b, all
    // three treated as row-major width x height arrays. Copies the inputs to
    // the device, launches subGPU, and copies the difference back.
    // NOTE(review): no CUDA error checking — failed allocations propagate
    // silently; consider a CUDA_CHECK macro.
    void  sub(double* a, double* b, double *result, int width, int height)
    {
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
        cudaMalloc((void**)&dev_a, width * height * sizeof(double));
        cudaMalloc((void**)&dev_b, width * height * sizeof(double));
        cudaMalloc((void**)&dev_c, width * height * sizeof(double));

        cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);
        subGPU << <Blocks, Threads >> > (dev_a, dev_b, dev_c, width, height);

        // Blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing any of the three device
        // buffers, leaking memory on every call.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    }


    最新回复(0)