1 #include <petscconf.h> 2 #include <../src/mat/impls/aij/mpi/mpiaij.h> 3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 4 #include <petscveckokkos.hpp> 5 6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 7 { 8 PetscErrorCode ierr; 9 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 10 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 11 12 PetscFunctionBegin; 13 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 14 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 15 ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr); 16 } 17 if (aijkok && aijkok->device_mat_d.data()) { 18 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 19 } 20 21 PetscFunctionReturn(0); 22 } 23 24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 25 { 26 PetscErrorCode ierr; 27 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 28 29 PetscFunctionBegin; 30 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 31 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 32 #if defined(PETSC_USE_DEBUG) 33 if (d_nnz) { 34 PetscInt i; 35 for (i=0; i<mat->rmap->n; i++) { 36 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 37 } 38 } 39 if (o_nnz) { 40 PetscInt i; 41 for (i=0; i<mat->rmap->n; i++) { 42 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 43 } 44 } 45 #endif 46 #if defined(PETSC_USE_CTABLE) 47 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 48 #else 49 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 50 #endif 51 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 52 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 53 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 54 /* Because the B will have been resized we simply destroy it and create a new one each time */ 55 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 56 57 if (!mpiaij->A) { 58 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 59 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 60 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 61 } 62 if (!mpiaij->B) { 63 PetscMPIInt size; 64 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRQ(ierr); 65 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 66 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 67 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 68 } 69 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 70 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 71 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 72 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 73 mat->preallocated = PETSC_TRUE; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 78 { 79 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 80 PetscErrorCode ierr; 81 PetscInt nt; 82 83 PetscFunctionBegin; 84 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 85 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 86 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 88 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 89 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 94 { 95 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 102 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 104 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 110 { 111 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 112 PetscErrorCode ierr; 113 PetscInt nt; 114 115 PetscFunctionBegin; 116 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 117 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt); 118 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 119 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 120 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 121 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 122 PetscFunctionReturn(0); 123 } 124 125 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 126 { 127 PetscErrorCode ierr; 128 Mat B; 129 130 PetscFunctionBegin; 131 if (reuse == MAT_INITIAL_MATRIX) { 132 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 133 } else if (reuse == MAT_REUSE_MATRIX) { 134 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 135 } 136 B = *newmat; 137 138 B->boundtocpu = PETSC_FALSE; 139 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 140 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 141 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 142 143 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 144 B->ops->mult = MatMult_MPIAIJKokkos; 145 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 146 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 147 // Needs an efficient implementation of the COO preallocation routines 148 //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 149 150 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 151 PetscFunctionReturn(0); 152 } 153 154 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 155 { 156 PetscErrorCode ierr; 157 158 PetscFunctionBegin; 159 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 160 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 161 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 162 PetscFunctionReturn(0); 163 } 164 165 /*@C 166 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 167 (the default parallel PETSc format). This matrix will ultimately pushed down 168 to Kokkos for calculations. For good matrix 169 assembly performance the user should preallocate the matrix storage by setting 170 the parameter nz (or the array nnz). By setting these parameters accurately, 171 performance during matrix assembly can be increased by more than a factor of 50. 172 173 Collective 174 175 Input Parameters: 176 + comm - MPI communicator, set to PETSC_COMM_SELF 177 . m - number of rows 178 . n - number of columns 179 . nz - number of nonzeros per row (same for all rows) 180 - nnz - array containing the number of nonzeros in the various rows 181 (possibly different for each row) or NULL 182 183 Output Parameter: 184 . A - the matrix 185 186 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 187 MatXXXXSetPreallocation() paradigm instead of this routine directly. 188 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 189 190 Notes: 191 If nnz is given then nz is ignored 192 193 The AIJ format (also called the Yale sparse matrix format or 194 compressed row storage), is fully compatible with standard Fortran 77 195 storage. That is, the stored row and column indices can begin at 196 either one (as in Fortran) or zero. See the users' manual for details. 197 198 Specify the preallocated storage with either nz or nnz (not both). 199 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 200 allocation. For large problems you MUST preallocate memory or you 201 will get TERRIBLE performance, see the users' manual chapter on matrices. 202 203 By default, this format uses inodes (identical nodes) when possible, to 204 improve numerical efficiency of matrix-vector products and solves. We 205 search for consecutive rows with the same nonzero structure, thereby 206 reusing matrix information to achieve increased efficiency. 207 208 Level: intermediate 209 210 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 211 @*/ 212 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 213 { 214 PetscErrorCode ierr; 215 PetscMPIInt size; 216 217 PetscFunctionBegin; 218 ierr = MatCreate(comm,A);CHKERRQ(ierr); 219 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 220 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 221 if (size > 1) { 222 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 223 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 224 } else { 225 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 226 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 227 } 228 PetscFunctionReturn(0); 229 } 230 231 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 232 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 233 { 234 #if defined(PETSC_USE_CTABLE) 235 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 236 #else 237 PetscMPIInt size,rank; 238 MPI_Comm comm; 239 PetscErrorCode ierr; 240 PetscSplitCSRDataStructure *d_mat=NULL; 241 242 PetscFunctionBegin; 243 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 244 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 245 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 246 if (size == 1) { 247 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 248 } else { 249 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 250 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 251 } 252 // act like MatSetValues because not called on host 253 if (A->assembled) { 254 if (A->was_assembled) { 255 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 256 } 257 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 258 } else { 259 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr); 260 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 261 } 262 if (!d_mat) { 263 PetscSplitCSRDataStructure h_mat; /* host container */ 264 Mat_SeqAIJKokkos *aijkokA; 265 Mat_SeqAIJ *jaca; 266 PetscInt n = A->rmap->n, nnz; 267 Mat Amat; 268 // create and copy 269 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 270 if (size == 1) { 271 Amat = A; 272 jaca = (Mat_SeqAIJ*)A->data; 273 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 274 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 275 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 276 h_mat.offdiag.a = NULL; 277 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 278 aijkokA->i_uncompressed_d = NULL; 279 aijkokA->colmap_d = NULL; 280 } else { 281 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 282 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 283 PetscInt ii; 284 Mat_SeqAIJKokkos *aijkokB; 285 Amat = aij->A; 286 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 287 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 288 aijkokA->i_uncompressed_d = NULL; 289 aijkokA->colmap_d = NULL; 290 jaca = (Mat_SeqAIJ*)aij->A->data; 291 if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 292 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 293 // create colmap - this is ussually done (lazy) in MatSetValues 294 aij->donotstash = PETSC_TRUE; 295 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 296 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 297 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 298 aij->colmap[A->cmap->N] = -9; 299 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 300 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 301 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 302 // allocate B copy data 303 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 304 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 305 nnz = jacb->i[n]; 306 if (jacb->compressedrow.use) { 307 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 308 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k)); 309 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 310 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 311 //err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 312 //err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 313 } else { 314 h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data(); 315 } 316 h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data(); 317 h_mat.offdiag.a = aijkokB->a_d.data(); 318 { 319 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N); 320 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k)); 321 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 322 h_mat.colmap = aijkokB->colmap_d->data(); 323 } 324 //err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 325 //err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 326 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 327 h_mat.offdiag.n = n; 328 } 329 // allocate A copy data 330 nnz = jaca->i[n]; 331 h_mat.diag.n = n; 332 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 333 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 334 if (jaca->compressedrow.use) { 335 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 336 } else { 337 h_mat.diag.i = (PetscInt*)aijkokA->i_d.data(); 338 } 339 //h_mat.diag.j = aj; 340 //h_mat.diag.a = aa; 341 h_mat.diag.j = (PetscInt*)aijkokA->j_d.data(); 342 h_mat.diag.a = aijkokA->a_d.data(); 343 // copy pointers and metdata to device 344 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 345 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 346 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 347 } 348 *B = d_mat; // return it, set it in Mat, and set it up 349 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 350 PetscFunctionReturn(0); 351 #endif 352 } 353