1 #include <petscconf.h> 2 #include <../src/mat/impls/aij/mpi/mpiaij.h> 3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 4 #include <petscveckokkos.hpp> 5 6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 7 { 8 PetscErrorCode ierr; 9 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 10 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 11 12 PetscFunctionBegin; 13 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 14 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 15 ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr); 16 } 17 if (aijkok && aijkok->device_mat_d.data()) { 18 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 19 } 20 21 PetscFunctionReturn(0); 22 } 23 24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 25 { 26 PetscErrorCode ierr; 27 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 28 29 PetscFunctionBegin; 30 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 31 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 32 #if defined(PETSC_USE_DEBUG) 33 if (d_nnz) { 34 PetscInt i; 35 for (i=0; i<mat->rmap->n; i++) { 36 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 37 } 38 } 39 if (o_nnz) { 40 PetscInt i; 41 for (i=0; i<mat->rmap->n; i++) { 42 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 43 } 44 } 45 #endif 46 #if defined(PETSC_USE_CTABLE) 47 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 48 #else 49 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 50 #endif 51 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 52 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 53 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 54 /* Because the B will have been resized we simply destroy it and create a new one each time */ 55 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 56 57 if (!mpiaij->A) { 58 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 59 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 60 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 61 } 62 if (!mpiaij->B) { 63 PetscMPIInt size; 64 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRQ(ierr); 65 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 66 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 67 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 68 } 69 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 70 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 71 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 72 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 73 mat->preallocated = PETSC_TRUE; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 78 { 79 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 80 PetscErrorCode ierr; 81 PetscInt nt; 82 83 PetscFunctionBegin; 84 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 85 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 86 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 88 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 89 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 94 { 95 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 102 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 104 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 110 { 111 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 112 PetscErrorCode ierr; 113 PetscInt nt; 114 115 PetscFunctionBegin; 116 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 117 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt); 118 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 119 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 120 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 121 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 122 PetscFunctionReturn(0); 123 } 124 125 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 126 { 127 PetscErrorCode ierr; 128 Mat B; 129 130 PetscFunctionBegin; 131 if (reuse == MAT_INITIAL_MATRIX) { 132 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 133 } else if (reuse == MAT_REUSE_MATRIX) { 134 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 135 } 136 B = *newmat; 137 138 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 139 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 140 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 141 142 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 143 B->ops->mult = MatMult_MPIAIJKokkos; 144 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 145 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 146 147 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 148 PetscFunctionReturn(0); 149 } 150 151 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 152 { 153 PetscErrorCode ierr; 154 155 PetscFunctionBegin; 156 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 157 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 158 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 159 PetscFunctionReturn(0); 160 } 161 162 /*@C 163 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 164 (the default parallel PETSc format). This matrix will ultimately pushed down 165 to Kokkos for calculations. For good matrix 166 assembly performance the user should preallocate the matrix storage by setting 167 the parameter nz (or the array nnz). By setting these parameters accurately, 168 performance during matrix assembly can be increased by more than a factor of 50. 169 170 Collective 171 172 Input Parameters: 173 + comm - MPI communicator, set to PETSC_COMM_SELF 174 . m - number of rows 175 . n - number of columns 176 . nz - number of nonzeros per row (same for all rows) 177 - nnz - array containing the number of nonzeros in the various rows 178 (possibly different for each row) or NULL 179 180 Output Parameter: 181 . A - the matrix 182 183 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 184 MatXXXXSetPreallocation() paradigm instead of this routine directly. 185 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 186 187 Notes: 188 If nnz is given then nz is ignored 189 190 The AIJ format (also called the Yale sparse matrix format or 191 compressed row storage), is fully compatible with standard Fortran 77 192 storage. That is, the stored row and column indices can begin at 193 either one (as in Fortran) or zero. See the users' manual for details. 194 195 Specify the preallocated storage with either nz or nnz (not both). 196 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 197 allocation. For large problems you MUST preallocate memory or you 198 will get TERRIBLE performance, see the users' manual chapter on matrices. 199 200 By default, this format uses inodes (identical nodes) when possible, to 201 improve numerical efficiency of matrix-vector products and solves. We 202 search for consecutive rows with the same nonzero structure, thereby 203 reusing matrix information to achieve increased efficiency. 204 205 Level: intermediate 206 207 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 208 @*/ 209 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 210 { 211 PetscErrorCode ierr; 212 PetscMPIInt size; 213 214 PetscFunctionBegin; 215 ierr = MatCreate(comm,A);CHKERRQ(ierr); 216 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 217 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 218 if (size > 1) { 219 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 220 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 221 } else { 222 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 223 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 224 } 225 PetscFunctionReturn(0); 226 } 227 228 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 229 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 230 { 231 #if defined(PETSC_USE_CTABLE) 232 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 233 #else 234 PetscMPIInt size,rank; 235 MPI_Comm comm; 236 PetscErrorCode ierr; 237 PetscSplitCSRDataStructure *d_mat=NULL; 238 239 PetscFunctionBegin; 240 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 241 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 242 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 243 if (size == 1) { 244 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 245 } else { 246 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 247 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 248 } 249 // act like MatSetValues because not called on host 250 if (A->assembled) { 251 if (A->was_assembled) { 252 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 253 } 254 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 255 } else { 256 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr); 257 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 258 } 259 if (!d_mat) { 260 PetscSplitCSRDataStructure h_mat; /* host container */ 261 Mat_SeqAIJKokkos *aijkokA; 262 Mat_SeqAIJ *jaca; 263 PetscInt n = A->rmap->n, nnz; 264 Mat Amat; 265 // create and copy 266 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 267 if (size == 1) { 268 Amat = A; 269 jaca = (Mat_SeqAIJ*)A->data; 270 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 271 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 272 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 273 h_mat.offdiag.a = NULL; 274 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 275 aijkokA->i_uncompressed_d = NULL; 276 aijkokA->colmap_d = NULL; 277 } else { 278 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 279 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 280 PetscInt ii; 281 Mat_SeqAIJKokkos *aijkokB; 282 Amat = aij->A; 283 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 284 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 285 aijkokA->i_uncompressed_d = NULL; 286 aijkokA->colmap_d = NULL; 287 jaca = (Mat_SeqAIJ*)aij->A->data; 288 if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 289 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 290 // create colmap - this is ussually done (lazy) in MatSetValues 291 aij->donotstash = PETSC_TRUE; 292 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 293 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 294 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 295 aij->colmap[A->cmap->N] = -9; 296 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 298 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 299 // allocate B copy data 300 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 301 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 302 nnz = jacb->i[n]; 303 if (jacb->compressedrow.use) { 304 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 305 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k)); 306 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 307 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 308 //err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 309 //err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 310 } else { 311 h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data(); 312 } 313 h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data(); 314 h_mat.offdiag.a = aijkokB->a_d.data(); 315 { 316 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N); 317 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k)); 318 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 319 h_mat.colmap = aijkokB->colmap_d->data(); 320 } 321 //err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 322 //err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 323 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 324 h_mat.offdiag.n = n; 325 } 326 // allocate A copy data 327 nnz = jaca->i[n]; 328 h_mat.diag.n = n; 329 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 330 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 331 if (jaca->compressedrow.use) { 332 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 333 } else { 334 h_mat.diag.i = (PetscInt*)aijkokA->i_d.data(); 335 } 336 //h_mat.diag.j = aj; 337 //h_mat.diag.a = aa; 338 h_mat.diag.j = (PetscInt*)aijkokA->j_d.data(); 339 h_mat.diag.a = aijkokA->a_d.data(); 340 // copy pointers and metdata to device 341 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 342 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 343 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 344 } 345 *B = d_mat; // return it, set it in Mat, and set it up 346 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 347 PetscFunctionReturn(0); 348 #endif 349 } 350