1f1be3500SJunchao Zhang #include <petscdevice.h> 2f1be3500SJunchao Zhang #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h> 3f1be3500SJunchao Zhang 4f1be3500SJunchao Zhang /* A class that manages helper arrays assisting parallel PCApply() with CUDA */ 5f1be3500SJunchao Zhang struct PC_VPBJacobi_CUDA { 6f1be3500SJunchao Zhang /* Cache the old sizes to check if we need realloc */ 7f1be3500SJunchao Zhang PetscInt n; /* number of rows of the local matrix */ 8f1be3500SJunchao Zhang PetscInt nblocks; /* number of point blocks */ 9f1be3500SJunchao Zhang PetscInt nsize; /* sum of sizes of the point blocks */ 10f1be3500SJunchao Zhang 11f1be3500SJunchao Zhang /* Helper arrays that are pre-computed on host and then copied to device. 12f1be3500SJunchao Zhang bs: [nblocks+1], "csr" version of bsizes[], with bs[0] = 0, bs[nblocks] = n. 13*9a56b474SJunchao Zhang bs2: [nblocks+1], "csr" version of squares of bsizes[], with bs2[0] = 0, bs2[nblocks] = nsize. 14f1be3500SJunchao Zhang matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block 15f1be3500SJunchao Zhang */ 16f1be3500SJunchao Zhang PetscInt *bs_h,*bs2_h,*matIdx_h; 17f1be3500SJunchao Zhang PetscInt *bs_d,*bs2_d,*matIdx_d; 18f1be3500SJunchao Zhang 19f1be3500SJunchao Zhang MatScalar *diag_d; /* [nsize], store inverse of the point blocks on device */ 20f1be3500SJunchao Zhang 21f1be3500SJunchao Zhang PC_VPBJacobi_CUDA(PetscInt n,PetscInt nblocks,PetscInt nsize,const PetscInt *bsizes,MatScalar *diag_h) 22f1be3500SJunchao Zhang : n(n),nblocks(nblocks),nsize(nsize) 23f1be3500SJunchao Zhang { 24f1be3500SJunchao Zhang /* malloc memory on host and device, and then update */ 25f1be3500SJunchao Zhang PetscCallVoid(PetscMalloc3(nblocks+1,&bs_h,nblocks+1,&bs2_h,n,&matIdx_h)); 26f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs_d,sizeof(PetscInt)*(nblocks+1))); 27f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&bs2_d,sizeof(PetscInt)*(nblocks+1))); 28f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&matIdx_d,sizeof(PetscInt)*n)); 29f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaMalloc(&diag_d,sizeof(MatScalar)*nsize)); 30f1be3500SJunchao Zhang PetscCallVoid(UpdateOffsetsOnDevice(bsizes,diag_h)); 31f1be3500SJunchao Zhang } 32f1be3500SJunchao Zhang 33f1be3500SJunchao Zhang PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes,MatScalar *diag_h) 34f1be3500SJunchao Zhang { 35f1be3500SJunchao Zhang PetscFunctionBegin; 36f1be3500SJunchao Zhang PetscCall(ComputeOffsetsOnHost(bsizes)); 37f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs_d,bs_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 38f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(bs2_d,bs2_h,sizeof(PetscInt)*(nblocks+1),cudaMemcpyHostToDevice)); 39f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(matIdx_d,matIdx_h,sizeof(PetscInt)*n,cudaMemcpyHostToDevice)); 40f1be3500SJunchao Zhang PetscCallCUDA(cudaMemcpy(diag_d,diag_h,sizeof(MatScalar)*nsize,cudaMemcpyHostToDevice)); 41*9a56b474SJunchao Zhang PetscCall(PetscLogCpuToGpu(sizeof(PetscInt)*(2*nblocks+2+n) + sizeof(MatScalar)*nsize)); 42f1be3500SJunchao Zhang PetscFunctionReturn(0); 43f1be3500SJunchao Zhang } 44f1be3500SJunchao Zhang 45f1be3500SJunchao Zhang ~PC_VPBJacobi_CUDA() 46f1be3500SJunchao Zhang { 47f1be3500SJunchao Zhang PetscCallVoid(PetscFree3(bs_h,bs2_h,matIdx_h)); 48f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs_d)); 49f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(bs2_d)); 50f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(matIdx_d)); 51f1be3500SJunchao Zhang PetscCallCUDAVoid(cudaFree(diag_d)); 52f1be3500SJunchao Zhang } 53f1be3500SJunchao Zhang 54f1be3500SJunchao Zhang private: 55f1be3500SJunchao Zhang PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes) 56f1be3500SJunchao Zhang { 57f1be3500SJunchao Zhang PetscFunctionBegin; 58f1be3500SJunchao Zhang bs_h[0] = bs2_h[0] = 0; 59f1be3500SJunchao Zhang for (PetscInt i=0; i<nblocks; i++) { 60f1be3500SJunchao Zhang bs_h[i+1] = bs_h[i] + bsizes[i]; 61f1be3500SJunchao Zhang bs2_h[i+1] = bs2_h[i] + bsizes[i]*bsizes[i]; 62f1be3500SJunchao Zhang for (PetscInt j=0; j<bsizes[i]; j++) matIdx_h[bs_h[i]+j] = i; 63f1be3500SJunchao Zhang } 64f1be3500SJunchao Zhang PetscFunctionReturn(0); 65f1be3500SJunchao Zhang } 66f1be3500SJunchao Zhang }; 67f1be3500SJunchao Zhang 68f1be3500SJunchao Zhang /* Like cublasDgemvBatched() but with variable-sized blocks 69f1be3500SJunchao Zhang 70f1be3500SJunchao Zhang Input Parameters: 71f1be3500SJunchao Zhang + n - number of rows of the local matrix 72f1be3500SJunchao Zhang . bs - [nblocks+1], prefix sum of bsizes[] 73f1be3500SJunchao Zhang . bs2 - [nblocks+1], prefix sum of squares of bsizes[] 74f1be3500SJunchao Zhang . matIdx - [n], store block/matrix index for each row 75f1be3500SJunchao Zhang . A - blocks of the matrix back to back in column-major order 76f1be3500SJunchao Zhang - x - the input vector 77f1be3500SJunchao Zhang 78f1be3500SJunchao Zhang Output Parameter: 79f1be3500SJunchao Zhang . y - the output vector 80f1be3500SJunchao Zhang */ 81f1be3500SJunchao Zhang __global__ static void MatMultBatched(PetscInt n,const PetscInt *bs,const PetscInt *bs2,const PetscInt *matIdx,const MatScalar *A,const PetscScalar *x,PetscScalar *y) 82f1be3500SJunchao Zhang { 83f1be3500SJunchao Zhang const PetscInt gridSize = gridDim.x * blockDim.x; 84f1be3500SJunchao Zhang PetscInt tid = blockIdx.x*blockDim.x + threadIdx.x; 85f1be3500SJunchao Zhang PetscInt i,j,k,m; 86f1be3500SJunchao Zhang 87f1be3500SJunchao Zhang /* One row per thread. The blocks/matrices are stored in column-major order */ 88f1be3500SJunchao Zhang for (; tid<n; tid += gridSize) { 89f1be3500SJunchao Zhang k = matIdx[tid]; /* k-th block */ 90f1be3500SJunchao Zhang m = bs[k+1] - bs[k]; /* block size of the k-th block */ 91f1be3500SJunchao Zhang i = tid - bs[k]; /* i-th row of the block */ 92f1be3500SJunchao Zhang A += bs2[k] + i; /* advance A to the first entry of i-th row */ 93f1be3500SJunchao Zhang x += bs[k]; 94f1be3500SJunchao Zhang y += bs[k]; 95f1be3500SJunchao Zhang 96f1be3500SJunchao Zhang y[i] = 0.0; 97f1be3500SJunchao Zhang for (j=0; j<m; j++) {y[i] += A[0]*x[j]; A += m;} 98f1be3500SJunchao Zhang } 99f1be3500SJunchao Zhang } 100f1be3500SJunchao Zhang 101f1be3500SJunchao Zhang static PetscErrorCode PCApply_VPBJacobi_CUDA(PC pc,Vec x,Vec y) 102f1be3500SJunchao Zhang { 103f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 104f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 105f1be3500SJunchao Zhang const PetscScalar *xx; 106f1be3500SJunchao Zhang PetscScalar *yy; 107f1be3500SJunchao Zhang PetscInt n; 108f1be3500SJunchao Zhang 109f1be3500SJunchao Zhang PetscFunctionBegin; 110*9a56b474SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 111f1be3500SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 112f1be3500SJunchao Zhang PetscBool isCuda; 113f1be3500SJunchao Zhang PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 114f1be3500SJunchao Zhang if (isCuda) PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&isCuda,VECSEQCUDA,VECMPICUDA,"")); 115f1be3500SJunchao Zhang PetscCheck(isCuda,PETSC_COMM_SELF,PETSC_ERR_SUP,"PC: applying a CUDA pmat to non-cuda vectors"); 116f1be3500SJunchao Zhang } 117f1be3500SJunchao Zhang 118f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 119f1be3500SJunchao Zhang if (n) { 120f1be3500SJunchao Zhang PetscInt gridSize = PetscMin((n+255)/256,2147483647); /* <= 2^31-1 */ 121f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayRead(x,&xx)); 122f1be3500SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(y,&yy)); 123f1be3500SJunchao Zhang MatMultBatched<<<gridSize,256>>>(n,pcuda->bs_d,pcuda->bs2_d,pcuda->matIdx_d,pcuda->diag_d,xx,yy); 124f1be3500SJunchao Zhang PetscCallCUDA(cudaGetLastError()); 125f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(x,&xx)); 126f1be3500SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 127f1be3500SJunchao Zhang } 128*9a56b474SJunchao Zhang PetscCall(PetscLogGpuFlops(pcuda->nsize*2)); /* FMA on entries in all blocks */ 129*9a56b474SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 130f1be3500SJunchao Zhang PetscFunctionReturn(0); 131f1be3500SJunchao Zhang } 132f1be3500SJunchao Zhang 133f1be3500SJunchao Zhang static PetscErrorCode PCDestroy_VPBJacobi_CUDA(PC pc) 134f1be3500SJunchao Zhang { 135f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 136f1be3500SJunchao Zhang 137f1be3500SJunchao Zhang PetscFunctionBegin; 138f1be3500SJunchao Zhang PetscCallCXX(delete static_cast<PC_VPBJacobi_CUDA*>(jac->spptr)); 139f1be3500SJunchao Zhang PCDestroy_VPBJacobi(pc); 140f1be3500SJunchao Zhang PetscFunctionReturn(0); 141f1be3500SJunchao Zhang } 142f1be3500SJunchao Zhang 143f1be3500SJunchao Zhang PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_CUDA(PC pc) 144f1be3500SJunchao Zhang { 145f1be3500SJunchao Zhang PC_VPBJacobi *jac = (PC_VPBJacobi*)pc->data; 146f1be3500SJunchao Zhang PC_VPBJacobi_CUDA *pcuda = static_cast<PC_VPBJacobi_CUDA*>(jac->spptr); 147f1be3500SJunchao Zhang PetscInt i,n,nblocks,nsize = 0; 148f1be3500SJunchao Zhang const PetscInt *bsizes; 149f1be3500SJunchao Zhang 150f1be3500SJunchao Zhang PetscFunctionBegin; 151f1be3500SJunchao Zhang PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */ 152f1be3500SJunchao Zhang PetscCall(MatGetVariableBlockSizes(pc->pmat,&nblocks,&bsizes)); 153f1be3500SJunchao Zhang for (i=0; i<nblocks; i++) nsize += bsizes[i]*bsizes[i]; 154f1be3500SJunchao Zhang PetscCall(MatGetLocalSize(pc->pmat,&n,NULL)); 155f1be3500SJunchao Zhang 156f1be3500SJunchao Zhang /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */ 157f1be3500SJunchao Zhang if (pcuda && (pcuda->n != n || pcuda->nblocks != nblocks || pcuda->nsize != nsize)) { 158f1be3500SJunchao Zhang PetscCallCXX(delete pcuda); 159f1be3500SJunchao Zhang pcuda = nullptr; 160f1be3500SJunchao Zhang } 161f1be3500SJunchao Zhang 162f1be3500SJunchao Zhang if (!pcuda) { /* allocate the struct along with the helper arrays from the scatch */ 163f1be3500SJunchao Zhang PetscCallCXX(jac->spptr = new PC_VPBJacobi_CUDA(n,nblocks,nsize,bsizes,jac->diag)); 164f1be3500SJunchao Zhang } else { /* update the value only */ 165f1be3500SJunchao Zhang PetscCall(pcuda->UpdateOffsetsOnDevice(bsizes,jac->diag)); 166f1be3500SJunchao Zhang } 167f1be3500SJunchao Zhang 168f1be3500SJunchao Zhang pc->ops->apply = PCApply_VPBJacobi_CUDA; 169f1be3500SJunchao Zhang pc->ops->destroy = PCDestroy_VPBJacobi_CUDA; 170f1be3500SJunchao Zhang PetscFunctionReturn(0); 171f1be3500SJunchao Zhang } 172