1 #include <petscvec_kokkos.hpp> 2 #include <../src/mat/impls/aij/mpi/mpiaij.h> 3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 4 5 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 6 { 7 PetscErrorCode ierr; 8 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 9 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 10 11 PetscFunctionBegin; 12 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 13 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 14 ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr); 15 } 16 if (aijkok && aijkok->device_mat_d.data()) { 17 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 18 } 19 20 PetscFunctionReturn(0); 21 } 22 23 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 24 { 25 PetscErrorCode ierr; 26 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 27 28 PetscFunctionBegin; 29 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 30 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 31 #if defined(PETSC_USE_DEBUG) 32 if (d_nnz) { 33 PetscInt i; 34 for (i=0; i<mat->rmap->n; i++) { 35 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 36 } 37 } 38 if (o_nnz) { 39 PetscInt i; 40 for (i=0; i<mat->rmap->n; i++) { 41 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 42 } 43 } 44 #endif 45 #if defined(PETSC_USE_CTABLE) 46 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 47 #else 48 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 49 #endif 50 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 51 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 52 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 53 /* Because the B will have been resized we simply destroy it and create a new one each time */ 54 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 55 56 if (!mpiaij->A) { 57 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 58 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 59 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 60 } 61 if (!mpiaij->B) { 62 PetscMPIInt size; 63 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr); 64 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 65 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 66 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 67 } 68 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 69 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 70 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 71 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 72 mat->preallocated = PETSC_TRUE; 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 77 { 78 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 79 PetscErrorCode ierr; 80 PetscInt nt; 81 82 PetscFunctionBegin; 83 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 84 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 85 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 86 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 87 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 88 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 93 { 94 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 101 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 102 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 103 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 104 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 105 PetscFunctionReturn(0); 106 } 107 108 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 109 { 110 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 111 PetscErrorCode ierr; 112 PetscInt nt; 113 114 PetscFunctionBegin; 115 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 116 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt); 117 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 118 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 119 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 120 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 121 PetscFunctionReturn(0); 122 } 123 124 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 125 { 126 PetscErrorCode ierr; 127 Mat B; 128 129 PetscFunctionBegin; 130 if (reuse == MAT_INITIAL_MATRIX) { 131 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 132 } else if (reuse == MAT_REUSE_MATRIX) { 133 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 134 } 135 B = *newmat; 136 137 B->boundtocpu = PETSC_FALSE; 138 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 139 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 140 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 141 142 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 143 B->ops->mult = MatMult_MPIAIJKokkos; 144 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 145 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 146 // Needs an efficient implementation of the COO preallocation routines 147 //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 148 149 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 150 PetscFunctionReturn(0); 151 } 152 153 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 154 { 155 PetscErrorCode ierr; 156 157 PetscFunctionBegin; 158 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 159 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 160 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 161 PetscFunctionReturn(0); 162 } 163 164 /*@C 165 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 166 (the default parallel PETSc format). This matrix will ultimately pushed down 167 to Kokkos for calculations. For good matrix 168 assembly performance the user should preallocate the matrix storage by setting 169 the parameter nz (or the array nnz). By setting these parameters accurately, 170 performance during matrix assembly can be increased by more than a factor of 50. 171 172 Collective 173 174 Input Parameters: 175 + comm - MPI communicator, set to PETSC_COMM_SELF 176 . m - number of rows 177 . n - number of columns 178 . nz - number of nonzeros per row (same for all rows) 179 - nnz - array containing the number of nonzeros in the various rows 180 (possibly different for each row) or NULL 181 182 Output Parameter: 183 . A - the matrix 184 185 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 186 MatXXXXSetPreallocation() paradigm instead of this routine directly. 187 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 188 189 Notes: 190 If nnz is given then nz is ignored 191 192 The AIJ format (also called the Yale sparse matrix format or 193 compressed row storage), is fully compatible with standard Fortran 77 194 storage. That is, the stored row and column indices can begin at 195 either one (as in Fortran) or zero. See the users' manual for details. 196 197 Specify the preallocated storage with either nz or nnz (not both). 198 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 199 allocation. For large problems you MUST preallocate memory or you 200 will get TERRIBLE performance, see the users' manual chapter on matrices. 201 202 By default, this format uses inodes (identical nodes) when possible, to 203 improve numerical efficiency of matrix-vector products and solves. We 204 search for consecutive rows with the same nonzero structure, thereby 205 reusing matrix information to achieve increased efficiency. 206 207 Level: intermediate 208 209 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 210 @*/ 211 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 212 { 213 PetscErrorCode ierr; 214 PetscMPIInt size; 215 216 PetscFunctionBegin; 217 ierr = MatCreate(comm,A);CHKERRQ(ierr); 218 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 219 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 220 if (size > 1) { 221 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 222 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 223 } else { 224 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 225 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 226 } 227 PetscFunctionReturn(0); 228 } 229 230 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 231 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 232 { 233 #if defined(PETSC_USE_CTABLE) 234 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 235 #else 236 PetscMPIInt size,rank; 237 MPI_Comm comm; 238 PetscErrorCode ierr; 239 PetscSplitCSRDataStructure *d_mat=NULL; 240 241 PetscFunctionBegin; 242 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 243 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 244 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 245 if (size == 1) { 246 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 247 } else { 248 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 249 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 250 } 251 // act like MatSetValues because not called on host 252 if (A->assembled) { 253 if (A->was_assembled) { 254 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 255 } 256 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 257 } else { 258 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr); 259 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 260 } 261 if (!d_mat) { 262 PetscSplitCSRDataStructure h_mat; /* host container */ 263 Mat_SeqAIJKokkos *aijkokA; 264 Mat_SeqAIJ *jaca; 265 PetscInt n = A->rmap->n, nnz; 266 Mat Amat; 267 // create and copy 268 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 269 if (size == 1) { 270 Amat = A; 271 jaca = (Mat_SeqAIJ*)A->data; 272 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 273 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 274 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 275 h_mat.offdiag.a = NULL; 276 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 277 aijkokA->i_uncompressed_d = NULL; 278 aijkokA->colmap_d = NULL; 279 } else { 280 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 281 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 282 PetscInt ii; 283 Mat_SeqAIJKokkos *aijkokB; 284 Amat = aij->A; 285 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 286 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 287 aijkokA->i_uncompressed_d = NULL; 288 aijkokA->colmap_d = NULL; 289 jaca = (Mat_SeqAIJ*)aij->A->data; 290 if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 291 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 292 // create colmap - this is ussually done (lazy) in MatSetValues 293 aij->donotstash = PETSC_TRUE; 294 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 295 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 296 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 297 aij->colmap[A->cmap->N] = -9; 298 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 299 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 300 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 301 // allocate B copy data 302 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 303 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 304 nnz = jacb->i[n]; 305 if (jacb->compressedrow.use) { 306 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 307 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 308 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 309 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 310 //err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 311 //err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 312 } else { 313 h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data(); 314 } 315 h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data(); 316 h_mat.offdiag.a = aijkokB->a_d.data(); 317 { 318 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N); 319 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 320 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 321 h_mat.colmap = aijkokB->colmap_d->data(); 322 } 323 //err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 324 //err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 325 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 326 h_mat.offdiag.n = n; 327 } 328 // allocate A copy data 329 nnz = jaca->i[n]; 330 h_mat.diag.n = n; 331 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 332 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 333 if (jaca->compressedrow.use) { 334 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 335 } else { 336 h_mat.diag.i = (PetscInt*)aijkokA->i_d.data(); 337 } 338 //h_mat.diag.j = aj; 339 //h_mat.diag.a = aa; 340 h_mat.diag.j = (PetscInt*)aijkokA->j_d.data(); 341 h_mat.diag.a = aijkokA->a_d.data(); 342 // copy pointers and metdata to device 343 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 344 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 345 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 346 } 347 *B = d_mat; // return it, set it in Mat, and set it up 348 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 349 PetscFunctionReturn(0); 350 #endif 351 } 352