1 #include <petscconf.h> 2 #include <../src/mat/impls/aij/mpi/mpiaij.h> 3 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 4 #include <petscveckokkos.hpp> 5 6 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 7 { 8 PetscErrorCode ierr; 9 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 10 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 11 12 PetscFunctionBegin; 13 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 14 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 15 ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr); 16 } 17 if (aijkok && aijkok->device_mat_d.data()) { 18 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 19 } 20 21 PetscFunctionReturn(0); 22 } 23 24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 29 30 PetscFunctionBegin; 31 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 32 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 33 if (d_nnz) { 34 for (i=0; i<mat->rmap->n; i++) { 35 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 36 } 37 } 38 if (o_nnz) { 39 for (i=0; i<mat->rmap->n; i++) { 40 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 41 } 42 } 43 if (!mat->preallocated) { 44 /* Explicitly create 2 MATSEQAIJKOKKOS matrices. */ 45 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 46 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 47 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 48 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 49 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 50 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,mat->cmap->N,mat->rmap->n,mat->cmap->N);CHKERRQ(ierr); 51 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 52 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 53 } 54 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 55 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 56 57 mat->preallocated = PETSC_TRUE; 58 PetscFunctionReturn(0); 59 } 60 61 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 62 { 63 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 64 PetscErrorCode ierr; 65 PetscInt nt; 66 67 PetscFunctionBegin; 68 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 69 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 70 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 71 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 72 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 73 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 78 { 79 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 80 PetscErrorCode ierr; 81 PetscInt nt; 82 83 PetscFunctionBegin; 84 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 85 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 86 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 88 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 89 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 94 { 95 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt); 102 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 103 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 110 { 111 PetscErrorCode ierr; 112 Mat B; 113 114 PetscFunctionBegin; 115 if (reuse == MAT_INITIAL_MATRIX) { 116 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 117 } else if (reuse == MAT_REUSE_MATRIX) { 118 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 119 } 120 B = *newmat; 121 122 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 123 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 124 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 125 126 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 127 B->ops->mult = MatMult_MPIAIJKokkos; 128 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 129 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 130 131 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 132 PetscFunctionReturn(0); 133 } 134 135 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 136 { 137 PetscErrorCode ierr; 138 139 PetscFunctionBegin; 140 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 141 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 142 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /*@C 147 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 148 (the default parallel PETSc format). This matrix will ultimately pushed down 149 to Kokkos for calculations. For good matrix 150 assembly performance the user should preallocate the matrix storage by setting 151 the parameter nz (or the array nnz). By setting these parameters accurately, 152 performance during matrix assembly can be increased by more than a factor of 50. 153 154 Collective 155 156 Input Parameters: 157 + comm - MPI communicator, set to PETSC_COMM_SELF 158 . m - number of rows 159 . n - number of columns 160 . nz - number of nonzeros per row (same for all rows) 161 - nnz - array containing the number of nonzeros in the various rows 162 (possibly different for each row) or NULL 163 164 Output Parameter: 165 . A - the matrix 166 167 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 168 MatXXXXSetPreallocation() paradigm instead of this routine directly. 169 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 170 171 Notes: 172 If nnz is given then nz is ignored 173 174 The AIJ format (also called the Yale sparse matrix format or 175 compressed row storage), is fully compatible with standard Fortran 77 176 storage. That is, the stored row and column indices can begin at 177 either one (as in Fortran) or zero. See the users' manual for details. 178 179 Specify the preallocated storage with either nz or nnz (not both). 180 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 181 allocation. For large problems you MUST preallocate memory or you 182 will get TERRIBLE performance, see the users' manual chapter on matrices. 183 184 By default, this format uses inodes (identical nodes) when possible, to 185 improve numerical efficiency of matrix-vector products and solves. We 186 search for consecutive rows with the same nonzero structure, thereby 187 reusing matrix information to achieve increased efficiency. 188 189 Level: intermediate 190 191 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 192 @*/ 193 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 194 { 195 PetscErrorCode ierr; 196 PetscMPIInt size; 197 198 PetscFunctionBegin; 199 ierr = MatCreate(comm,A);CHKERRQ(ierr); 200 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 201 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 202 if (size > 1) { 203 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 204 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 205 } else { 206 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 207 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 208 } 209 PetscFunctionReturn(0); 210 } 211 212 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 213 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 214 { 215 #if defined(PETSC_USE_CTABLE) 216 SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 217 #else 218 PetscMPIInt size,rank; 219 MPI_Comm comm; 220 PetscErrorCode ierr; 221 PetscSplitCSRDataStructure *d_mat=NULL; 222 223 PetscFunctionBegin; 224 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 225 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 226 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 227 if (size == 1) { 228 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 229 } else { 230 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 231 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 232 } 233 // act like MatSetValues because not called on host 234 if (A->assembled) { 235 if (A->was_assembled) { 236 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 237 } 238 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 239 } else { 240 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr); 241 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 242 } 243 if (!d_mat) { 244 PetscSplitCSRDataStructure h_mat; /* host container */ 245 Mat_SeqAIJKokkos *aijkokA; 246 Mat_SeqAIJ *jaca; 247 PetscInt n = A->rmap->n, nnz; 248 Mat Amat; 249 // create and copy 250 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 251 if (size == 1) { 252 Amat = A; 253 jaca = (Mat_SeqAIJ*)A->data; 254 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 255 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 256 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 257 h_mat.offdiag.a = NULL; 258 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 259 aijkokA->i_uncompressed_d = NULL; 260 aijkokA->colmap_d = NULL; 261 } else { 262 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 263 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 264 PetscInt ii; 265 Mat_SeqAIJKokkos *aijkokB; 266 Amat = aij->A; 267 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 268 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 269 aijkokA->i_uncompressed_d = NULL; 270 aijkokA->colmap_d = NULL; 271 jaca = (Mat_SeqAIJ*)aij->A->data; 272 if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 273 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 274 // create colmap - this is ussually done (lazy) in MatSetValues 275 aij->donotstash = PETSC_TRUE; 276 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 277 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 278 ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 279 aij->colmap[A->cmap->N] = -9; 280 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 281 for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 282 if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 283 // allocate B copy data 284 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 285 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 286 nnz = jacb->i[n]; 287 if (jacb->compressedrow.use) { 288 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 289 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k)); 290 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 291 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 292 //err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 293 //err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 294 } else { 295 h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data(); 296 } 297 h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data(); 298 h_mat.offdiag.a = aijkokB->a_d.data(); 299 { 300 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N); 301 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k)); 302 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 303 h_mat.colmap = aijkokB->colmap_d->data(); 304 } 305 //err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 306 //err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 307 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 308 h_mat.offdiag.n = n; 309 } 310 // allocate A copy data 311 nnz = jaca->i[n]; 312 h_mat.diag.n = n; 313 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 314 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 315 if (jaca->compressedrow.use) { 316 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 317 } else { 318 h_mat.diag.i = (PetscInt*)aijkokA->i_d.data(); 319 } 320 //h_mat.diag.j = aj; 321 //h_mat.diag.a = aa; 322 h_mat.diag.j = (PetscInt*)aijkokA->j_d.data(); 323 h_mat.diag.a = aijkokA->a_d.data(); 324 // copy pointers and metdata to device 325 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 326 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 327 ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 328 } 329 *B = d_mat; // return it, set it in Mat, and set it up 330 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 331 PetscFunctionReturn(0); 332 #endif 333 } 334