GPU矩阵计算

    xiaoxiao · 2022-06-30 · 161

    学习GPU加速 参考例子和网上博文写的GPU矩阵计算

    // Matrix transpose kernel (CUDA SDK "transpose" sample style).
    // Reads `indata` as a height x width row-major matrix and writes its
    // width x height transpose into `outdata`. Expects a BLOCK_DIM x BLOCK_DIM
    // thread block and grid (ceil(width/BLOCK_DIM), ceil(height/BLOCK_DIM)).
    __global__ void transposeGPU(double *outdata, double *indata, int width, int height)
    {
        // +1 column of padding avoids shared-memory bank conflicts on the
        // transposed (column-wise) reads below.
        __shared__ double block[BLOCK_DIM][BLOCK_DIM + 1];
        unsigned int xIndex = blockIdx.x*BLOCK_DIM + threadIdx.x;
        unsigned int yIndex = blockIdx.y*BLOCK_DIM + threadIdx.y;

        // Stage one tile of the input into shared memory (coalesced read).
        if ((xIndex < width) && (yIndex < height))
        {
            unsigned int index_in = yIndex*width + xIndex;
            block[threadIdx.y][threadIdx.x] = indata[index_in];
        }

        // Every thread must finish writing its tile element before any
        // thread reads the tile back transposed.
        __syncthreads();

        // Re-derive the indices with the block coordinates swapped so the
        // store to the output is also coalesced.
        xIndex = blockIdx.y*BLOCK_DIM + threadIdx.x;
        yIndex = blockIdx.x*BLOCK_DIM + threadIdx.y;

        // Output has `height` columns; guard handles partial edge tiles.
        if ((xIndex < height) && (yIndex < width))
        {
            unsigned int index_out = yIndex*height + xIndex;
            outdata[index_out] = block[threadIdx.x][threadIdx.y];
        }

    }

    // Matrix transpose host wrapper: copies `indata` (height x width,
    // row-major) to the device, launches transposeGPU, and copies the
    // width x height transpose back into `outdata`.
    // NOTE(review): no CUDA error checking anywhere in this file — a failed
    // cudaMalloc silently propagates garbage; consider a CUDA_CHECK macro.
    void transpose(double *outdata, double *indata, int width, int height)
    {
        double *d_a, *d_b;
        cudaMalloc((double**)&d_a, sizeof(double) * width * height);
        cudaMalloc((double**)&d_b, sizeof(double) * height * width);

        cudaMemcpy((void*)d_a, (void*)indata, sizeof(double) * width * height, cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);

        transposeGPU << <Blocks, Threads >> > (d_b, d_a, width, height);
        // The blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy((void*)outdata, (void*)d_b, sizeof(double) * width * height, cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing either buffer, leaking
        // 2 * width * height doubles of device memory on every call.
        cudaFree(d_a);
        cudaFree(d_b);
    }
    // Matrix multiplication

    // Tiled matrix-multiplication kernel (CUDA SDK "matrixMul" sample style):
    // C = A * B, all matrices row-major. WA = columns of A, WB = columns of B.
    // Launch with BLOCK_SIZE x BLOCK_SIZE threads and a grid covering one
    // block per BLOCK_SIZE x BLOCK_SIZE tile of C.
    // NOTE(review): there are NO bounds guards on the loads/stores below, so
    // WA, WB and the height of A are presumably required to be exact
    // multiples of BLOCK_SIZE — confirm at call sites (MultiMatrix builds the
    // grid with ceil-division, which over-runs for non-multiple sizes).
    template<int BLOCK_SIZE> __global__ void MatrixMulGPU(double *c, const double *a, const double *b, unsigned int WA, unsigned int WB)
    {
        // Block index
        int bx = blockIdx.x;
        int by = blockIdx.y;

        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;

        // Index of the first sub-matrix of A processed by the block
        int aBegin = WA * BLOCK_SIZE * by;

        // Index of the last sub-matrix of A processed by the block
        int aEnd = aBegin + WA - 1;

        // Step size used to iterate through the sub-matrices of A
        int aStep = BLOCK_SIZE;

        // Index of the first sub-matrix of B processed by the block
        int bBegin = BLOCK_SIZE * bx;

        // Step size used to iterate through the sub-matrices of B
        int bStep = BLOCK_SIZE * WB;

        // Csub accumulates the single element of C computed by this thread.
        double Csub = 0;

        // March one tile-row of A against one tile-column of B.
        for (int i = aBegin, j = bBegin;
            i <= aEnd;
            i += aStep, j += bStep)
        {
            // Shared-memory tile of A for this step.
            __shared__ double As[BLOCK_SIZE][BLOCK_SIZE];

            // Shared-memory tile of B for this step.
            __shared__ double Bs[BLOCK_SIZE][BLOCK_SIZE];

            // Each thread loads one element of each tile (coalesced).
            As[ty][tx] = a[i + WA * ty + tx];
            Bs[ty][tx] = b[j + WB * ty + tx];

            // Both tiles must be fully loaded before they are consumed.
            __syncthreads();

            // Multiply the two tiles; each thread accumulates one element.
#pragma unroll

            for (int k = 0; k < BLOCK_SIZE; ++k)
            {
                Csub += As[ty][k] * Bs[k][tx];
            }

            // Tiles may not be overwritten until every thread is done
            // reading them in this iteration.
            __syncthreads();
        }

        // Write the block sub-matrix to device memory;
        // each thread writes one element.
        int k = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
        c[k + WB * ty + tx] = Csub;
    }

    // Computes R = A * B on the GPU. A is Arow x Acol, B is Brow x Bcol and
    // R is Arow x Bcol; *A, *B and *R must each point at contiguous
    // row-major storage (only the first row pointer is used).
    // NOTE(review): the tiled kernel has no bounds guards, so the dimensions
    // are presumably expected to be exact multiples of BLOCK_DIM — confirm,
    // because the ceil-divided grid below over-runs otherwise. Brow is only
    // used for the B allocation and is assumed equal to Acol — TODO confirm.
    void  MultiMatrix(double **A, double **B, double **R, int Arow, int Acol, int Brow, int Bcol)
    {
        const size_t bytesA = Arow * Acol * sizeof(double);
        const size_t bytesB = Brow * Bcol * sizeof(double);
        const size_t bytesC = Arow * Bcol * sizeof(double);

        double *dA = 0;
        double *dB = 0;
        double *dC = 0;
        cudaMalloc((void**)&dA, bytesA);
        cudaMalloc((void**)&dB, bytesB);
        cudaMalloc((void**)&dC, bytesC);

        cudaMemcpy(dA, *A, bytesA, cudaMemcpyHostToDevice);
        cudaMemcpy(dB, *B, bytesB, cudaMemcpyHostToDevice);

        // One thread per element of R, tiled BLOCK_DIM x BLOCK_DIM.
        dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
        dim3 blocksPerGrid(ceil((float)Bcol / (float)BLOCK_DIM),
                           ceil((float)Arow / (float)BLOCK_DIM));
        MatrixMulGPU<BLOCK_DIM> << <blocksPerGrid, threadsPerBlock >> > (dC, dA, dB, Acol, Bcol);

        // Blocking copy-back also synchronizes with the kernel launch.
        cudaMemcpy(*R, dC, bytesC, cudaMemcpyDeviceToHost);

        cudaFree(dA);
        cudaFree(dB);
        cudaFree(dC);
    }

    // Matrix inversion via Gauss-Jordan elimination.
    // Inverts the matrix_size x matrix_size row-major matrix A into R, using
    // an identity matrix staged in shared memory (MXI) as the augmented half.
    // Limitations (unchanged from the original design):
    //  - single-tile algorithm: requires matrix_size <= BLOCK_DIM and one
    //    BLOCK_DIM x BLOCK_DIM block; additional blocks would redo the same
    //    elimination and race on A;
    //  - no pivoting: divides by A[i][i], so a zero pivot yields inf/nan.
    // A is modified in place (ends as the identity for invertible input).
    __global__ void InvertAGPU(double *A, double *R, int matrix_size)
    {
        __shared__ double MXI[BLOCK_DIM][BLOCK_DIM];

        int isx = threadIdx.x;
        int isy = threadIdx.y;
        double tmpIn;
        double tmpInv;

        // Initialize MXI to the identity matrix.
        if (isx == isy)
            MXI[isy][isx] = 1;
        else
            MXI[isy][isx] = 0;

        for (int i = 0; i < matrix_size; i++)
        {
            if (i == isy && isx < matrix_size && isy < matrix_size)
            {
                // Scale pivot row i so the diagonal element becomes 1,
                // applying the same operation to the augmented half.
                tmpIn = A[i*matrix_size + i];
                A[i*matrix_size + isx] /= tmpIn;
                MXI[i][isx] /= tmpIn;
            }
            __syncthreads();
            if (i != isy && isx < matrix_size && isy < matrix_size)
            {
                // Eliminate column i from every other row, mirrored on MXI.
                tmpInv = A[isy*matrix_size + i];
                A[isy*matrix_size + isx] -= tmpInv * A[i*matrix_size + isx];
                MXI[isy][isx] -= tmpInv * MXI[i][isx];
            }
            // Barriers are outside the divergent branches: all threads reach them.
            __syncthreads();
        }

        // FIX: the original stored unconditionally, so whenever
        // matrix_size < BLOCK_DIM the out-of-range threads wrote shared-memory
        // garbage past the end of the matrix_size^2 buffer R.
        if (isx < matrix_size && isy < matrix_size)
            R[isx*matrix_size + isy] = MXI[isx][isy];
    }

    // Inverts the matrix_size x matrix_size row-major matrix A into R on the
    // GPU via InvertAGPU. A itself is not modified on the host (only the
    // device copy is eliminated in place).
    // NOTE(review): InvertAGPU is a single-tile shared-memory algorithm, so
    // this presumably only produces correct results for
    // matrix_size <= BLOCK_DIM — the multi-block grid computed below would
    // make every block repeat (and race on) the same elimination. Confirm
    // before using on larger matrices.
    void InvertA(double *A, double *R, int matrix_size)
    {
        const size_t bytes = matrix_size * matrix_size * sizeof(double);

        double *Agpu = 0;
        double *Rgpu = 0;
        cudaMalloc((void**)&Agpu, bytes);
        cudaMalloc((void**)&Rgpu, bytes);

        cudaMemcpy(Agpu, A, bytes, cudaMemcpyHostToDevice);

        dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
        dim3 blocksPerGrid(ceil((float)matrix_size / (float)BLOCK_DIM),
                           ceil((float)matrix_size / (float)BLOCK_DIM));
        InvertAGPU << <blocksPerGrid, threadsPerBlock >> > (Agpu, Rgpu, matrix_size);

        // Blocking copy-back also synchronizes with the kernel launch.
        cudaMemcpy(R, Rgpu, bytes, cudaMemcpyDeviceToHost);

        cudaFree(Agpu);
        cudaFree(Rgpu);
    }

    // Matrix addition
    // Element-wise addition c = a + b for row-major width x height matrices.
    // FIX: the original mapped blockIdx.x to the ROW index while the host
    // wrapper sizes the grid's x-dimension from `width` (columns); for any
    // non-square matrix, rows or columns beyond the mismatched grid extent
    // were silently never computed. x now indexes columns and y indexes rows,
    // matching the host grid — this also makes the global accesses coalesced
    // (adjacent threadIdx.x touches adjacent addresses).
    __global__ void addGPU(double* a, double* b, double *c, int width, int height)
    {
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        int row = threadIdx.y + blockIdx.y * blockDim.y;

        if (row < height && col < width)
        {
            c[row*width + col] = a[row*width + col] + b[row*width + col];
        }
    }

    // Element-wise matrix addition host wrapper: result = a + b, all three
    // treated as row-major width x height arrays. Copies the inputs to the
    // device, launches addGPU, and copies the sum back.
    // NOTE(review): no CUDA error checking — failed allocations propagate
    // silently; consider a CUDA_CHECK macro.
    void  add(double* a, double* b, double *result, int width, int height)
    {
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
        cudaMalloc((void**)&dev_a, width * height * sizeof(double));
        cudaMalloc((void**)&dev_b, width * height * sizeof(double));
        cudaMalloc((void**)&dev_c, width * height * sizeof(double));

        cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);
        addGPU << <Blocks, Threads >> > (dev_a, dev_b, dev_c, width, height);

        // Blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing any of the three device
        // buffers, leaking memory on every call.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    }

    // Matrix subtraction
    // Element-wise subtraction c = a - b for row-major width x height matrices.
    // FIX: same grid/index mismatch as the original addGPU — blockIdx.x was
    // used for the ROW index while the host grid sizes its x-dimension from
    // `width` (columns), leaving part of any non-square matrix uncomputed.
    // x now indexes columns and y indexes rows, matching the host grid and
    // giving coalesced global accesses.
    __global__ void subGPU(double* a, double* b, double *c, int width, int height)
    {
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        int row = threadIdx.y + blockIdx.y * blockDim.y;

        if (row < height && col < width)
        {
            c[row*width + col] = a[row*width + col] - b[row*width + col];
        }
    }

    // Element-wise matrix subtraction host wrapper: result = a - b, all
    // three treated as row-major width x height arrays. Copies the inputs to
    // the device, launches subGPU, and copies the difference back.
    // NOTE(review): no CUDA error checking — failed allocations propagate
    // silently; consider a CUDA_CHECK macro.
    void  sub(double* a, double* b, double *result, int width, int height)
    {
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
        cudaMalloc((void**)&dev_a, width * height * sizeof(double));
        cudaMalloc((void**)&dev_b, width * height * sizeof(double));
        cudaMalloc((void**)&dev_c, width * height * sizeof(double));

        cudaMemcpy(dev_a, a, width * height * sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, width * height * sizeof(double), cudaMemcpyHostToDevice);

        dim3 Threads(BLOCK_DIM, BLOCK_DIM);
        int b_x = ceil((float)width / (float)BLOCK_DIM);
        int b_y = ceil((float)height / (float)BLOCK_DIM);
        dim3 Blocks(b_x, b_y);
        subGPU << <Blocks, Threads >> > (dev_a, dev_b, dev_c, width, height);

        // Blocking DeviceToHost copy also synchronizes with the kernel.
        cudaMemcpy(result, dev_c, width * height * sizeof(double), cudaMemcpyDeviceToHost);

        // FIX: the original returned without freeing any of the three device
        // buffers, leaking memory on every call.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    }


    最新回复(0)