1 #include <petscvec_kokkos.hpp> 2 #include <petscsf.h> 3 #include <petsc/private/sfimpl.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> 5 #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp> 6 #include <KokkosSparse_spadd.hpp> 7 8 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 9 { 10 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 11 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 12 13 PetscFunctionBegin; 14 PetscCall(MatAssemblyEnd_MPIAIJ(A,mode)); 15 if (aijkok && aijkok->device_mat_d.data()) { 16 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 17 } 18 19 PetscFunctionReturn(0); 20 } 21 22 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 23 { 24 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 25 26 PetscFunctionBegin; 27 PetscCall(PetscLayoutSetUp(mat->rmap)); 28 PetscCall(PetscLayoutSetUp(mat->cmap)); 29 #if defined(PETSC_USE_DEBUG) 30 if (d_nnz) { 31 PetscInt i; 32 for (i=0; i<mat->rmap->n; i++) { 33 PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 34 } 35 } 36 if (o_nnz) { 37 PetscInt i; 38 for (i=0; i<mat->rmap->n; i++) { 39 PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 40 } 41 } 42 #endif 43 #if defined(PETSC_USE_CTABLE) 44 PetscCall(PetscTableDestroy(&mpiaij->colmap)); 45 #else 46 PetscCall(PetscFree(mpiaij->colmap)); 47 #endif 48 PetscCall(PetscFree(mpiaij->garray)); 49 PetscCall(VecDestroy(&mpiaij->lvec)); 50 PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 51 /* Because the B will have been resized we simply destroy it and create a new one each time */ 52 PetscCall(MatDestroy(&mpiaij->B)); 53 54 if (!mpiaij->A) { 55 PetscCall(MatCreate(PETSC_COMM_SELF,&mpiaij->A)); 56 PetscCall(MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n)); 57 PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A)); 58 } 59 if (!mpiaij->B) { 60 PetscMPIInt size; 61 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); 62 PetscCall(MatCreate(PETSC_COMM_SELF,&mpiaij->B)); 63 PetscCall(MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0)); 64 PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B)); 65 } 66 PetscCall(MatSetType(mpiaij->A,MATSEQAIJKOKKOS)); 67 PetscCall(MatSetType(mpiaij->B,MATSEQAIJKOKKOS)); 68 PetscCall(MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz)); 69 PetscCall(MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz)); 70 mat->preallocated = PETSC_TRUE; 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 75 { 76 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 77 PetscInt nt; 78 79 PetscFunctionBegin; 80 PetscCall(VecGetLocalSize(xx,&nt)); 81 PetscCheck(nt == mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 82 PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 83 PetscCall((*mpiaij->A->ops->mult)(mpiaij->A,xx,yy)); 84 PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 85 PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy)); 86 PetscFunctionReturn(0); 87 } 88 89 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 90 { 91 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 92 PetscInt nt; 93 94 PetscFunctionBegin; 95 PetscCall(VecGetLocalSize(xx,&nt)); 96 PetscCheck(nt == mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 97 PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 98 PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz)); 99 PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 100 PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz)); 101 PetscFunctionReturn(0); 102 } 103 104 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 105 { 106 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 107 PetscInt nt; 108 109 PetscFunctionBegin; 110 PetscCall(VecGetLocalSize(xx,&nt)); 111 PetscCheck(nt == mat->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt); 112 PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec)); 113 PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy)); 114 PetscCall(VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 115 PetscCall(VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 116 PetscFunctionReturn(0); 117 } 118 119 /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS. 120 A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n). 121 C still uses local column ids. Their corresponding global column ids are returned in glob. 122 */ 123 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C) 124 { 125 Mat Ad,Ao; 126 const PetscInt *cmap; 127 128 PetscFunctionBegin; 129 PetscCall(MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap)); 130 PetscCall(MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C)); 131 if (glob) { 132 PetscInt cst, i, dn, on, *gidx; 133 PetscCall(MatGetLocalSize(Ad,NULL,&dn)); 134 PetscCall(MatGetLocalSize(Ao,NULL,&on)); 135 PetscCall(MatGetOwnershipRangeColumn(mat,&cst,NULL)); 136 PetscCall(PetscMalloc1(dn+on,&gidx)); 137 for (i=0; i<dn; i++) gidx[i] = cst + i; 138 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 139 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob)); 140 } 141 PetscFunctionReturn(0); 142 } 143 144 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */ 145 struct MatMatStruct { 146 MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */ 147 PetscSF sf; /* SF to send/recv matrix entries */ 148 MatScalarKokkosView abuf; /* buf of mat values in send/recv */ 149 Mat C1,C2,B_local; 150 KokkosCsrMatrix C1_global,C2_global,C_global; 151 KernelHandle kh; 152 MatMatStruct() { 153 C1 = C2 = B_local = NULL; 154 sf = NULL; 155 } 156 157 ~MatMatStruct() { 158 MatDestroy(&C1); 159 MatDestroy(&C2); 160 MatDestroy(&B_local); 161 PetscSFDestroy(&sf); 162 kh.destroy_spadd_handle(); 163 } 164 }; 165 166 struct MatMatStruct_AB : public MatMatStruct { 167 MatColIdxKokkosView rows; 168 MatRowMapKokkosView rowoffset; 169 Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */ 170 171 MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){} 172 ~MatMatStruct_AB() { 173 MatDestroy(&B_other); 174 MatDestroy(&C_petsc); 175 } 176 }; 177 178 struct MatMatStruct_AtB : public MatMatStruct { 179 MatRowMapKokkosView srcrowoffset,dstrowoffset; 180 }; 181 182 struct MatProductData_MPIAIJKokkos 183 { 184 MatMatStruct_AB *mmAB; 185 MatMatStruct_AtB *mmAtB; 186 PetscBool reusesym; 187 188 MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){} 189 ~MatProductData_MPIAIJKokkos() { 190 delete mmAB; 191 delete mmAtB; 192 } 193 }; 194 195 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data) 196 { 197 PetscFunctionBegin; 198 PetscCallCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data)); 199 PetscFunctionReturn(0); 200 } 201 202 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix 203 204 Input Parameters: 205 + A - the MATSEQAIJKOKKOS matrix 206 . N - new column size for the returned Kokkos matrix 207 - l2g - a map that maps old col ids to new col ids 208 209 Output Parameters: 210 . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A. 211 */ 212 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat) 213 { 214 KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat; 215 MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */ 216 217 PetscFunctionBegin; 218 PetscCallCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));})); 219 PetscCallCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg)); 220 PetscFunctionReturn(0); 221 } 222 223 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix. 224 It is similar to MatCreateMPIAIJWithSplitArrays. 225 226 Input Parameters: 227 + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set 228 . A - the diag matrix using local col ids 229 - B - the offdiag matrix using global col ids 230 231 Output Parameters: 232 . mat - the updated MATMPIAIJKOKKOS matrix 233 */ 234 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B) 235 { 236 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 237 PetscInt m,n,M,N,Am,An,Bm,Bn; 238 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 239 240 PetscFunctionBegin; 241 PetscCall(MatGetSize(mat,&M,&N)); 242 PetscCall(MatGetLocalSize(mat,&m,&n)); 243 PetscCall(MatGetLocalSize(A,&Am,&An)); 244 PetscCall(MatGetLocalSize(B,&Bm,&Bn)); 245 246 PetscCheckFalse(m != Am || m != Bm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match"); 247 PetscCheck(n == An,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match"); 248 PetscCheck(N == Bn,PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match"); 249 PetscCheck(!mpiaij->A && !mpiaij->B,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty"); 250 mpiaij->A = A; 251 mpiaij->B = B; 252 253 mat->preallocated = PETSC_TRUE; 254 mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */ 255 256 PetscCall(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE)); 257 PetscCall(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY)); 258 /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and 259 also gets mpiaij->B compacted, with its col ids and size reduced 260 */ 261 PetscCall(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY)); 262 PetscCall(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE)); 263 PetscCall(MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE)); 264 265 /* Update bkok with new local col ids (stored on host) and size */ 266 bkok->j_dual.modify_host(); 267 bkok->j_dual.sync_device(); 268 bkok->SetColSize(mpiaij->B->cmap->n); 269 PetscFunctionReturn(0); 270 } 271 272 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C). 273 274 It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF. 275 In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves. 276 Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k 277 to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph). 278 279 Collective on comm of ownerSF 280 281 Input Parameters: 282 + B - the SEQAIJKOKKOS matrix, using local col ids 283 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 284 . N - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX) 285 . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX) 286 . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX) 287 288 Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX) 289 + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros. 290 . abuf - buffer for sending matrix values 291 . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[]. 292 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors. 293 . rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[] 294 - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids. 295 */ 296 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 297 PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows, 298 MatRowMapKokkosView& rowoffset,Mat& C) 299 { 300 Mat_SeqAIJKokkos *bkok,*ckok; 301 302 PetscFunctionBegin; 303 PetscCall(MatSeqAIJKokkosSyncDevice(B)); /* Make sure B->spptr is accessible */ 304 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 305 306 if (reuse == MAT_REUSE_MATRIX) { 307 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 308 309 const auto& Ba = bkok->a_dual.view_device(); 310 const auto& Bi = bkok->i_dual.view_device(); 311 const auto& Ca = ckok->a_dual.view_device(); 312 313 /* Copy Ba to abuf */ 314 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 315 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 316 PetscInt r = rows(i); 317 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 318 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 319 abuf(base+k) = Ba(Bi(r)+k); 320 }); 321 }); 322 323 /* Send abuf to Ca through bcastSF and then mark C is updated on device */ 324 PetscCall(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE)); /* TODO: get memtype for abuf */ 325 PetscCall(PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE)); 326 ckok->a_dual.modify_device(); 327 } else if (reuse == MAT_INITIAL_MATRIX) { 328 MPI_Comm comm; 329 PetscMPIInt tag; 330 PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves; 331 332 PetscCallMPI(PetscObjectGetComm((PetscObject)ownerSF,&comm)); 333 PetscCall(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL)); 334 Cm = nleaves; /* row size of C */ 335 Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */ 336 337 /* Get row lens (nz) of B's rows for later fast query */ 338 PetscInt *Browlens; 339 const PetscInt *tmp = bkok->i_host_data(); 340 PetscCall(PetscMalloc1(nroots,&Browlens)); 341 for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k]; 342 343 /* By ownerSF, each proc gets lens of rows of C */ 344 MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */ 345 Ci_h = Ci.view_host().data(); 346 Ci_h[0] = 0; 347 PetscCall(PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE)); 348 PetscCall(PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE)); 349 for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */ 350 Cnnz = Ci_h[Cm]; 351 Ci.modify_host(); 352 Ci.sync_device(); 353 354 /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */ 355 MatColIdxKokkosDualView Cj("j",Cnnz); 356 MatScalarKokkosDualView Ca("a",Cnnz); 357 358 /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */ 359 const PetscMPIInt *iranks,*ranks; 360 const PetscInt *ioffset,*irootloc,*roffset; 361 PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr; 362 MPI_Request *reqs; 363 364 PetscCall(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc)); /* irootloc[] contains indices of rows I need to send to each receiver */ 365 PetscCall(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* recv info */ 366 367 /* figure out offsets at the send buffer, to build the SF 368 sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver. 369 rowptr[] - stores offsets for data of each row in abuf 370 371 rdisp[] - to receive sdisp[] 372 */ 373 PetscCall(PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs)); 374 MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */ 375 rowptr = rowptr_h.data(); 376 377 sdisp[0] = 0; 378 rowptr[0] = 0; 379 for (i=0; i<niranks; i++) { /* for each receiver */ 380 PetscInt len, nz = 0; 381 for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */ 382 len = Browlens[irootloc[j]]; 383 rowptr[j+1] = rowptr[j] + len; 384 nz += len; 385 } 386 sdisp[i+1] = sdisp[i] + nz; 387 } 388 PetscCallMPI(PetscCommGetNewTag(comm,&tag)); 389 for (i=0; i<nranks; i++) PetscCallMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i])); 390 for (i=0; i<niranks; i++) PetscCallMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i])); 391 PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 392 393 PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */ 394 PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */ 395 PetscSFNode *iremote; 396 PetscCall(PetscMalloc1(nleaves2,&iremote)); 397 for (i=0; i<nranks; i++) { /* for each sender */ 398 k = 0; 399 for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) { 400 iremote[j].rank = ranks[i]; 401 iremote[j].index = rdisp[i] + k; 402 k++; 403 } 404 } 405 /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */ 406 PetscCall(PetscSFCreate(comm,&bcastSF)); 407 PetscCall(PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER)); 408 409 /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted 410 from local to global. Then use bcastSF to fill Ca, Cj. 411 */ 412 ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */ 413 MatColIdxKokkosView rows("rows",ioffset[niranks]); 414 Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */ 415 416 rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */ 417 418 MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */ 419 abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */ 420 421 const auto& Ba = bkok->a_dual.view_device(); 422 const auto& Bi = bkok->i_dual.view_device(); 423 const auto& Bj = bkok->j_dual.view_device(); 424 425 /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */ 426 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 427 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 428 PetscInt r = rows(i); 429 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 430 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 431 abuf(base+k) = Ba(Bi(r)+k); 432 jbuf(base+k) = l2g(Bj(Bi(r)+k)); 433 }); 434 }); 435 436 /* Send abuf & jbuf to fill Ca, Cj */ 437 PetscCall(PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE)); 438 PetscCall(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE)); 439 PetscCall(PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE)); 440 PetscCall(PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE)); 441 Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */ 442 Cj.sync_host(); 443 Ca.modify_device(); 444 445 /* Construct C with Ca, Ci, Cj */ 446 auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca); 447 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C)); 448 PetscCall(PetscFree3(sdisp,rdisp,reqs)); 449 PetscCall(PetscFree(Browlens)); 450 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 451 PetscFunctionReturn(0); 452 } 453 454 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C) 455 456 It is the reverse of MatSeqAIJKokkosBcast in some sense. 457 458 Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves. 459 In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might 460 contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF. 461 462 Input Parameters: 463 + A - the SEQAIJKOKKOS matrix to be reduced 464 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 465 . local - true if A uses local col ids; false if A is already in global col ids. 466 . N - if local, N is A's global col size 467 . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N). 468 - ownerSF - the SF specifies ownership (root) of rows in A 469 470 Output Parameters: 471 + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side 472 . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows. 473 . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len. 474 . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to 475 unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset. 476 - C - the matrix made up by rows sent to me from other ranks, using global col ids 477 478 TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX. 479 */ 480 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 481 PetscSF& reduceSF,MatScalarKokkosView& abuf, 482 MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset, 483 KokkosCsrMatrix& C) 484 { 485 PetscInt i,r,Am,An,Annz,Cnnz,nrows; 486 const PetscInt *Ai; 487 Mat_SeqAIJKokkos *akok; 488 489 PetscFunctionBegin; 490 PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* So that A's latest data is on device */ 491 PetscCall(MatGetSize(A,&Am,&An)); 492 Ai = static_cast<Mat_SeqAIJ*>(A->data)->i; 493 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 494 Annz = Ai[Am]; 495 496 if (reuse == MAT_REUSE_MATRIX) { 497 /* Send Aa to abuf */ 498 PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 499 PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 500 501 /* Copy abuf to Ca */ 502 const MatScalarKokkosView& Ca = C.values; 503 nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */ 504 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 505 PetscInt i = t.league_rank(); 506 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 507 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 508 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);}); 509 }); 510 } else if (reuse == MAT_INITIAL_MATRIX) { 511 MPI_Comm comm; 512 MPI_Request *reqs; 513 PetscMPIInt tag; 514 PetscInt Cm; 515 516 PetscCall(PetscObjectGetComm((PetscObject)ownerSF,&comm)); 517 PetscCall(PetscCommGetNewTag(comm,&tag)); 518 519 PetscInt niranks,nranks,nroots,nleaves; 520 const PetscMPIInt *iranks,*ranks; 521 const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */ 522 PetscCall(PetscSFSetUp(ownerSF)); 523 PetscCall(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows)); /* recv info: iranks[] will send rows to me */ 524 PetscCall(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* send info */ 525 PetscCall(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL)); 526 PetscCheck(nleaves == Am,PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am); 527 Cm = nroots; 528 nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */ 529 530 /* Tell owners how long each row I will send */ 531 PetscInt *srowlens; /* send buf of row lens */ 532 MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */ 533 PetscInt *rrowlens = rrowlens_h.data(); 534 535 PetscCall(PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs)); 536 for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i]; 537 rrowlens[0] = 0; 538 rrowlens++; /* shift the pointer to make the following expression more readable */ 539 for (i=0; i<niranks; i++)PetscCallMPI(MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i])); 540 for (i=0; i<nranks; i++) PetscCallMPI(MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i])); 541 PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 542 543 /* Owner builds Ci on host by histogramming rrowlens[] */ 544 MatRowMapKokkosViewHost Ci_h("i",Cm+1); 545 Kokkos::deep_copy(Ci_h,0); /* Zero Ci */ 546 MatRowMapType *Ci_ptr = Ci_h.data(); 547 548 for (i=0; i<nrows; i++) { 549 r = rows[i]; /* local row id of i-th received row */ 550 #if defined(PETSC_USE_DEBUG) 551 PetscCheckFalse(r<0 || r>=Cm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm); 552 #endif 553 Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */ 554 } 555 for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */ 556 Cnnz = Ci_ptr[Cm]; 557 558 /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */ 559 MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows); 560 PetscInt *dstrowoffset_hptr = dstrowoffset_h.data(); 561 PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */ 562 563 PetscCall(PetscCalloc1(Cm,&currowlens)); /* Init with zero, to be added to */ 564 for (i=0; i<nrows; i++) { /* for each row I receive */ 565 r = rows[i]; /* row id in C */ 566 dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */ 567 currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */ 568 } 569 PetscCall(PetscFree(currowlens)); 570 571 rrowlens--; 572 for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */ 573 dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h); 574 srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */ 575 576 /* Build the reduceSF, which performs buffer to buffer send/recv */ 577 PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */ 578 PetscCall(PetscMalloc2(niranks,&sdisp,nranks,&rdisp)); 579 for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]]; 580 for (i=0; i<nranks; i++) PetscCallMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i])); 581 for (i=0; i<niranks; i++) PetscCallMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i])); 582 PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 583 584 /* Nonzeros in abuf/jbuf are roots and those in A are leaves */ 585 PetscInt nroots2 = Cnnz,nleaves2 = Annz; 586 PetscSFNode *iremote; 587 PetscCall(PetscMalloc1(nleaves2,&iremote)); /* no free, since memory will be given to reduceSF */ 588 for (i=0; i<nranks; i++) { 589 PetscInt rootbase = rdisp[i]; /* root offset at this root rank */ 590 PetscInt leafbase = Ai[roffset[i]]; /* leaf base */ 591 PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */ 592 for (PetscInt k=0; k<nz; k++) { 593 iremote[leafbase+k].rank = ranks[i]; 594 iremote[leafbase+k].index = rootbase + k; 595 } 596 } 597 PetscCall(PetscSFCreate(comm,&reduceSF)); 598 PetscCall(PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER)); 599 PetscCall(PetscFree2(sdisp,rdisp)); 600 601 /* Reduce Aa, Ajg to abuf and jbuf */ 602 603 /* If A uses local col ids, convert them to global ones before sending */ 604 MatColIdxKokkosView Ajg; 605 if (local) { 606 Ajg = MatColIdxKokkosView("j",Annz); 607 const MatColIdxKokkosView& Aj = akok->j_dual.view_device(); 608 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));}); 609 } else { 610 Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */ 611 } 612 613 MatColIdxKokkosView jbuf("jbuf",Cnnz); 614 abuf = MatScalarKokkosView("abuf",Cnnz); 615 PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE)); 616 PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE)); 617 PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 618 PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 619 620 /* Copy data from abuf, jbuf to Ca, Cj */ 621 MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */ 622 MatColIdxKokkosView Cj("j",Cnnz); 623 MatScalarKokkosView Ca("a",Cnnz); 624 625 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 626 PetscInt i = t.league_rank(); 627 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 628 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 629 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) { 630 Ca(dst+k) = abuf(src+k); 631 Cj(dst+k) = jbuf(src+k); 632 }); 633 }); 634 635 /* Build C with Ca, Ci, Cj */ 636 C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj); 637 PetscCall(PetscFree2(srowlens,reqs)); 638 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 639 PetscFunctionReturn(0); 640 } 641 642 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix 643 644 Input Parameters: 645 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 646 . reuse - indicate whether the matrix has called this function before 647 . csrmat - the KokkosCsrMatrix, of size m,N 648 - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first 649 entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag 650 entry is 5, then Cdstart[i] = 3. 651 652 Output Parameters: 653 + C - the updated MATMPIAIJKOKKOS matrix 654 - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 655 656 Notes: 657 Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern 658 */ 659 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart) 660 { 661 const MatScalarKokkosView& Ca = csrmat.values; 662 const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map; 663 PetscInt m,n,N; 664 665 PetscFunctionBegin; 666 PetscCall(MatGetLocalSize(C,&m,&n)); 667 PetscCall(MatGetSize(C,NULL,&N)); 668 669 if (reuse == MAT_REUSE_MATRIX) { 670 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data); 671 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr); 672 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr); 673 const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device(); 674 const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device(); 675 676 /* Fill 'a' of Cd and Co on device */ 677 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 678 PetscInt i = t.league_rank(); /* row i */ 679 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 680 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 681 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 682 PetscInt cdend = cdstart + cdlen; 683 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 684 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 685 if (k < cdstart) { /* k in [0, cdstart) */ 686 Coa(Coi(i)+k) = Ca(Ci(i)+k); 687 } else if (k < cdend) { /* k in [cdstart, cdend) */ 688 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 689 } else { /* k in [cdend, clen) */ 690 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 691 } 692 }); 693 }); 694 695 akok->a_dual.modify_device(); 696 bkok->a_dual.modify_device(); 697 } else if (reuse == MAT_INITIAL_MATRIX) { 698 Mat Cd,Co; 699 const MatColIdxKokkosView& Cj = csrmat.graph.entries; 700 MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1); 701 MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device(); 702 PetscInt cstart,cend; 703 704 /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks: 705 left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend). 706 Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend, 707 such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly 708 stores values of cdstart and cdend-cstart (aka Cdi[]) instead. 709 */ 710 Cdstart = MatRowMapKokkosView("Cdstart",m); 711 PetscCall(PetscLayoutGetRange(C->cmap,&cstart,&cend)); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */ 712 713 /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a 714 CUDA warp would completely diverge. So I use TeamPolicy with a team size 1. 715 */ 716 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 717 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 718 PetscInt i = t.league_rank(); /* row i */ 719 PetscInt j,first,count,step; 720 721 if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */ 722 Cdi(0) = 0; 723 Coi(0) = 0; 724 } 725 726 /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns 727 in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found. 728 */ 729 count = Ci(i+1)-Ci(i); 730 first = Ci(i); 731 while (count > 0) { 732 j = first; 733 step = count / 2; 734 j += step; 735 if (Cj(j) < cstart) { 736 first = ++j; 737 count -= step + 1; 738 } else count = step; 739 } 740 Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */ 741 742 /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */ 743 count = Ci(i+1) - first; 744 while (count > 0) { 745 j = first; 746 step = count / 2; 747 j += step; 748 if (Cj(j) < cend) { 749 first = ++j; 750 count -= step + 1; 751 } else count = step; 752 } 753 Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */ 754 Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */ 755 }); 756 }); 757 758 /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */ 759 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 760 update += Cdi(i); 761 if (final) Cdi(i) = update; 762 }); 763 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 764 update += Coi(i); 765 if (final) Coi(i) = update; 766 }); 767 768 /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in 769 MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co. 770 */ 771 Cdi_dual.modify_device(); 772 Coi_dual.modify_device(); 773 Cdi_dual.sync_host(); 774 Coi_dual.sync_host(); 775 PetscInt Cd_nnz = Cdi_dual.view_host().data()[m]; 776 PetscInt Co_nnz = Coi_dual.view_host().data()[m]; 777 778 /* With nnz, allocate a, j for Cd and Co */ 779 MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz); 780 MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz); 781 782 /* Fill a, j of Cd and Co on device */ 783 MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device(); 784 MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device(); 785 786 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 787 PetscInt i = t.league_rank(); /* row i */ 788 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 789 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 790 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 791 PetscInt cdend = cdstart + cdlen; 792 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 793 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 794 if (k < cdstart) { /* k in [0, cdstart) */ 795 Coa(Coi(i)+k) = Ca(Ci(i)+k); 796 Coj(Coi(i)+k) = Cj(Ci(i)+k); 797 } else if (k < cdend) { /* k in [cdstart, cdend) */ 798 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 799 Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */ 800 } else { /* k in [cdend, clen) */ 801 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 802 Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k); 803 } 804 }); 805 }); 806 807 Cdj_dual.modify_device(); 808 Cda_dual.modify_device(); 809 Coj_dual.modify_device(); 810 Coa_dual.modify_device(); 811 /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */ 812 auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual); 813 auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual); 814 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd)); 815 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co)); 816 PetscCall(MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co)); /* Coj will be converted to local ids within */ 817 } 818 PetscFunctionReturn(0); 819 } 820 821 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids. 822 823 It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view. 824 825 Input Parameters: 826 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 827 . reuse - indicate whether the matrix has called this function before 828 . csrmat - the KokkosCsrMatrix, of size m,N 829 - Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first 830 entry of the diag block of C in csrmat's j array. 831 832 Output Parameters: 833 + C - the updated MATMPIAIJKOKKOS matrix 834 - Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 835 836 Notes: the input matrix's col ids and col size will be changed. 837 */ 838 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g) 839 { 840 Mat_SeqAIJKokkos *ckok; 841 ISLocalToGlobalMapping l2gmap; 842 const PetscInt *garray; 843 PetscInt sz; 844 845 PetscFunctionBegin; 846 /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */ 847 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap)); 848 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 849 ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */ 850 ckok->j_dual.sync_device(); 851 ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */ 852 853 /* Build l2g -- the local to global mapping of C's cols */ 854 PetscCall(ISLocalToGlobalMappingGetIndices(l2gmap,&garray)); 855 PetscCall(ISLocalToGlobalMappingGetSize(l2gmap,&sz)); 856 PetscCheck(C->cmap->n == sz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz); 857 858 ConstMatColIdxKokkosViewHost tmp(garray,sz); 859 l2g = MatColIdxKokkosView("l2g",sz); 860 Kokkos::deep_copy(l2g,tmp); 861 862 PetscCall(ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray)); 863 PetscCall(ISLocalToGlobalMappingDestroy(&l2gmap)); 864 PetscFunctionReturn(0); 865 } 866 867 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos 868 869 Input Parameters: 870 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 871 . A - an MPIAIJKOKKOS matrix 872 . B - an MPIAIJKOKKOS matrix 873 - mm - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations. 874 875 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 876 */ 877 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm) 878 { 879 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 880 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 881 IS glob = NULL; 882 const PetscInt *garray; 883 PetscInt N = B->cmap->N,sz; 884 ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */ 885 MatColIdxKokkosView l2g2; 886 Mat C1,C2; /* intermediate matrices */ 887 888 PetscFunctionBegin; 889 /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */ 890 PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local)); 891 PetscCall(MatProductCreate(Ad,mm->B_local,NULL,&C1)); 892 PetscCall(MatProductSetType(C1,MATPRODUCT_AB)); 893 PetscCall(MatProductSetFill(C1,product->fill)); 894 C1->product->api_user = product->api_user; 895 PetscCall(MatProductSetFromOptions(C1)); 896 PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 897 PetscCall((*C1->ops->productsymbolic)(C1)); 898 899 PetscCall(ISGetIndices(glob,&garray)); 900 PetscCall(ISGetSize(glob,&sz)); 901 const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */ 902 l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */ 903 PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global)); 904 905 /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */ 906 PetscCall(MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,mm->abuf,mm->rows,mm->rowoffset,mm->B_other)); 907 908 /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */ 909 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2)); 910 PetscCall(MatProductCreate(Ao,mm->B_other,NULL,&C2)); 911 PetscCall(MatProductSetType(C2,MATPRODUCT_AB)); 912 PetscCall(MatProductSetFill(C2,product->fill)); 913 C2->product->api_user = product->api_user; 914 PetscCall(MatProductSetFromOptions(C2)); 915 PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 916 PetscCall((*C2->ops->productsymbolic)(C2)); 917 PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global)); 918 919 /* C = C1 + C2. We actually use their global col ids versions in adding */ 920 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */ 921 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 922 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 923 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 924 925 mm->C1 = C1; 926 mm->C2 = C2; 927 PetscCall(ISRestoreIndices(glob,&garray)); 928 PetscCall(ISDestroy(&glob)); 929 PetscFunctionReturn(0); 930 } 931 932 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos 933 934 Input Parameters: 935 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 936 . A - an MPIAIJKOKKOS matrix 937 . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank. 938 . localB - Does B use local col ids? If false, then B is already in global col ids. 939 . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator. 940 . l2g - If localB, then l2g maps B's local col ids to global ones. 941 - mm - a struct used to stash intermediate data in AtB 942 943 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 944 */ 945 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm) 946 { 947 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 948 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 949 Mat C1,C2; /* intermediate matrices */ 950 951 PetscFunctionBegin; 952 /* C1 = Ad^t * B */ 953 PetscCall(MatProductCreate(Ad,B,NULL,&C1)); 954 PetscCall(MatProductSetType(C1,MATPRODUCT_AtB)); 955 PetscCall(MatProductSetFill(C1,product->fill)); 956 C1->product->api_user = product->api_user; 957 PetscCall(MatProductSetFromOptions(C1)); 958 PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 959 PetscCall((*C1->ops->productsymbolic)(C1)); 960 961 if (localB) PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global)); 962 else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */ 963 964 /* C2 = Ao^t * B */ 965 PetscCall(MatProductCreate(Ao,B,NULL,&C2)); 966 PetscCall(MatProductSetType(C2,MATPRODUCT_AtB)); 967 PetscCall(MatProductSetFill(C2,product->fill)); 968 C2->product->api_user = product->api_user; 969 PetscCall(MatProductSetFromOptions(C2)); 970 PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 971 PetscCall((*C2->ops->productsymbolic)(C2)); 972 973 PetscCall(MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,mm->srcrowoffset,mm->dstrowoffset,mm->C2_global)); 974 975 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */ 976 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 977 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 978 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 979 mm->C1 = C1; 980 mm->C2 = C2; 981 PetscFunctionReturn(0); 982 } 983 984 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C) 985 { 986 Mat_Product *product = C->product; 987 MatProductType ptype; 988 MatProductData_MPIAIJKokkos *mmdata; 989 MatMatStruct *mm = NULL; 990 MatMatStruct_AB *ab; 991 MatMatStruct_AtB *atb; 992 Mat A,B,Ad,Ao,Bd,Bo; 993 const MatScalarType one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */ 994 995 PetscFunctionBegin; 996 MatCheckProduct(C,1); 997 mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data); 998 ptype = product->type; 999 A = product->A; 1000 B = product->B; 1001 Ad = static_cast<Mat_MPIAIJ*>(A->data)->A; 1002 Ao = static_cast<Mat_MPIAIJ*>(A->data)->B; 1003 Bd = static_cast<Mat_MPIAIJ*>(B->data)->A; 1004 Bo = static_cast<Mat_MPIAIJ*>(B->data)->B; 1005 1006 if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 1007 mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 1008 ab = mmdata->mmAB; 1009 atb = mmdata->mmAtB; 1010 if (ab) { 1011 static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE; 1012 static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE; 1013 } 1014 if (atb) { 1015 static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE; 1016 static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE; 1017 } 1018 PetscFunctionReturn(0); 1019 } 1020 1021 if (ptype == MATPRODUCT_AB) { 1022 ab = mmdata->mmAB; 1023 /* C1 = Ad * B_local */ 1024 PetscCheck(ab->C1->ops->productnumeric && ab->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB"); 1025 PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local)); 1026 PetscCheck(ab->C1->product->B == ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed"); 1027 if (ab->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,ab->C1)); 1028 PetscCall((*ab->C1->ops->productnumeric)(ab->C1)); 1029 PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1030 ab->abuf,ab->rows,ab->rowoffset,ab->B_other)); 1031 /* C2 = Ao * B_other */ 1032 PetscCheck(ab->C2->product->B == ab->B_other,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed"); 1033 if (ab->C1->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,ab->C2)); 1034 PetscCall((*ab->C2->ops->productnumeric)(ab->C2)); 1035 /* C = C1_global + C2_global */ 1036 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1037 mm = static_cast<MatMatStruct*>(ab); 1038 } else if (ptype == MATPRODUCT_AtB) { 1039 atb = mmdata->mmAtB; 1040 PetscCheck(atb->C1->ops->productnumeric && atb->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB"); 1041 /* C1 = Ad^t * B_local */ 1042 PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local)); 1043 PetscCheck(atb->C1->product->B == atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed"); 1044 if (atb->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,atb->C1)); 1045 PetscCall((*atb->C1->ops->productnumeric)(atb->C1)); 1046 1047 /* C2 = Ao^t * B_local */ 1048 PetscCheck(atb->C2->product->B == atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed"); 1049 if (atb->C2->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,atb->C2)); 1050 PetscCall((*atb->C2->ops->productnumeric)(atb->C2)); 1051 /* Form C2_global */ 1052 PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1053 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global)); 1054 /* C = C1_global + C2_global */ 1055 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1056 mm = static_cast<MatMatStruct*>(atb); 1057 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1058 ab = mmdata->mmAB; 1059 PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local)); 1060 1061 /* ab->C1 = Ad * B_local */ 1062 PetscCheck(ab->C1->product->B == ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed"); 1063 if (ab->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,ab->C1)); 1064 PetscCall((*ab->C1->ops->productnumeric)(ab->C1)); 1065 PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1066 ab->abuf,ab->rows,ab->rowoffset,ab->B_other)); 1067 /* ab->C2 = Ao * B_other */ 1068 if (ab->C2->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,ab->C2)); 1069 PetscCall((*ab->C2->ops->productnumeric)(ab->C2)); /* C2 = Ao * B_other */ 1070 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1071 1072 /* atb->C1 = Bd^t * ab->C_petsc */ 1073 atb = mmdata->mmAtB; 1074 PetscCheck(atb->C1->product->B == ab->C_petsc,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed"); 1075 if (atb->C1->product->A != Bd) PetscCall(MatProductReplaceMats(Bd,NULL,NULL,atb->C1)); 1076 PetscCall((*atb->C1->ops->productnumeric)(atb->C1)); 1077 /* atb->C2 = Bo^t * ab->C_petsc */ 1078 if (atb->C2->product->A != Bo) PetscCall(MatProductReplaceMats(Bo,NULL,NULL,atb->C2)); 1079 PetscCall((*atb->C2->ops->productnumeric)(atb->C2)); 1080 PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1081 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global)); 1082 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1083 mm = static_cast<MatMatStruct*>(atb); 1084 } 1085 /* Split C_global to form C */ 1086 PetscCall(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart)); 1087 PetscFunctionReturn(0); 1088 } 1089 1090 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C) 1091 { 1092 Mat A,B; 1093 Mat_Product *product = C->product; 1094 MatProductType ptype; 1095 MatProductData_MPIAIJKokkos *mmdata; 1096 MatMatStruct *mm = NULL; 1097 IS glob = NULL; 1098 const PetscInt *garray; 1099 PetscInt m,n,M,N,sz; 1100 ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */ 1101 1102 PetscFunctionBegin; 1103 MatCheckProduct(C,1); 1104 PetscCheck(!product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1105 ptype = product->type; 1106 A = product->A; 1107 B = product->B; 1108 1109 switch (ptype) { 1110 case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break; 1111 case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break; 1112 case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */ 1113 default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 1114 } 1115 1116 PetscCall(MatSetSizes(C,m,n,M,N)); 1117 PetscCall(PetscLayoutSetUp(C->rmap)); 1118 PetscCall(PetscLayoutSetUp(C->cmap)); 1119 PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 1120 1121 mmdata = new MatProductData_MPIAIJKokkos(); 1122 mmdata->reusesym = product->api_user; 1123 1124 if (ptype == MATPRODUCT_AB) { 1125 mmdata->mmAB = new MatMatStruct_AB(); 1126 PetscCall(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB)); 1127 mm = static_cast<MatMatStruct*>(mmdata->mmAB); 1128 } else if (ptype == MATPRODUCT_AtB) { 1129 mmdata->mmAtB = new MatMatStruct_AtB(); 1130 auto atb = mmdata->mmAtB; 1131 PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local)); 1132 PetscCall(ISGetIndices(glob,&garray)); 1133 PetscCall(ISGetSize(glob,&sz)); 1134 l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz)); 1135 PetscCall(MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb)); 1136 PetscCall(ISRestoreIndices(glob,&garray)); 1137 PetscCall(ISDestroy(&glob)); 1138 mm = static_cast<MatMatStruct*>(atb); 1139 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1140 mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */ 1141 mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */ 1142 auto ab = mmdata->mmAB; 1143 auto atb = mmdata->mmAtB; 1144 PetscCall(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab)); 1145 auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */ 1146 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc)); 1147 PetscCall(MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb)); 1148 mm = static_cast<MatMatStruct*>(atb); 1149 } 1150 /* Split the C_global into petsc A, B format */ 1151 PetscCall(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart)); 1152 C->product->data = mmdata; 1153 C->product->destroy = MatProductDataDestroy_MPIAIJKokkos; 1154 C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos; 1155 PetscFunctionReturn(0); 1156 } 1157 1158 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat) 1159 { 1160 Mat_Product *product = mat->product; 1161 PetscBool match = PETSC_FALSE; 1162 PetscBool usecpu = PETSC_FALSE; 1163 1164 PetscFunctionBegin; 1165 MatCheckProduct(mat,1); 1166 if (!product->A->boundtocpu && !product->B->boundtocpu) { 1167 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match)); 1168 } 1169 if (match) { /* we can always fallback to the CPU if requested */ 1170 switch (product->type) { 1171 case MATPRODUCT_AB: 1172 if (product->api_user) { 1173 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 1174 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 1175 PetscOptionsEnd(); 1176 } else { 1177 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 1178 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 1179 PetscOptionsEnd(); 1180 } 1181 break; 1182 case MATPRODUCT_AtB: 1183 if (product->api_user) { 1184 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 1185 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 1186 PetscOptionsEnd(); 1187 } else { 1188 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 1189 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 1190 PetscOptionsEnd(); 1191 } 1192 break; 1193 case MATPRODUCT_PtAP: 1194 if (product->api_user) { 1195 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 1196 PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 1197 PetscOptionsEnd(); 1198 } else { 1199 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 1200 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 1201 PetscOptionsEnd(); 1202 } 1203 break; 1204 default: 1205 break; 1206 } 1207 match = (PetscBool)!usecpu; 1208 } 1209 if (match) { 1210 switch (product->type) { 1211 case MATPRODUCT_AB: 1212 case MATPRODUCT_AtB: 1213 case MATPRODUCT_PtAP: 1214 mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos; 1215 break; 1216 default: 1217 break; 1218 } 1219 } 1220 /* fallback to MPIAIJ ops */ 1221 if (!mat->ops->productsymbolic) { 1222 PetscCall(MatProductSetFromOptions_MPIAIJ(mat)); 1223 } 1224 PetscFunctionReturn(0); 1225 } 1226 1227 static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1228 { 1229 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 1230 Mat_MPIAIJKokkos *mpikok; 1231 1232 PetscFunctionBegin; 1233 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j)); 1234 mat->preallocated = PETSC_TRUE; 1235 PetscCall(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY)); 1236 PetscCall(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY)); 1237 PetscCall(MatZeroEntries(mat)); 1238 mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 1239 delete mpikok; 1240 mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij); 1241 PetscFunctionReturn(0); 1242 } 1243 1244 static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode) 1245 { 1246 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 1247 Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 1248 Mat A = mpiaij->A,B = mpiaij->B; 1249 PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2; 1250 MatScalarKokkosView Aa,Ba; 1251 MatScalarKokkosView v1; 1252 MatScalarKokkosView& vsend = mpikok->sendbuf_d; 1253 const MatScalarKokkosView& v2 = mpikok->recvbuf_d; 1254 const PetscCountKokkosView& Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d; 1255 const PetscCountKokkosView& Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d; 1256 const PetscCountKokkosView& Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d; 1257 const PetscCountKokkosView& Cperm1 = mpikok->Cperm1_d; 1258 PetscMemType memtype; 1259 1260 PetscFunctionBegin; 1261 PetscCall(PetscGetMemType(v,&memtype)); /* Return PETSC_MEMTYPE_HOST when v is NULL */ 1262 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */ 1263 v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n)); 1264 } else { 1265 v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */ 1266 } 1267 1268 if (imode == INSERT_VALUES) { 1269 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1270 PetscCall(MatSeqAIJGetKokkosViewWrite(B,&Ba)); 1271 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1272 Kokkos::deep_copy(Ba,0.0); 1273 } else { 1274 PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1275 PetscCall(MatSeqAIJGetKokkosView(B,&Ba)); 1276 } 1277 1278 /* Pack entries to be sent to remote */ 1279 Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));}); 1280 1281 /* Send remote entries to their owner and overlap the communication with local computation */ 1282 PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE)); 1283 /* Add local entries to A and B */ 1284 Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));}); 1285 Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));}); 1286 PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE)); 1287 1288 /* Add received remote entries to A and B */ 1289 Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));}); 1290 Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));}); 1291 1292 if (imode == INSERT_VALUES) { 1293 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); /* Increase A & B's state etc. */ 1294 PetscCall(MatSeqAIJRestoreKokkosViewWrite(B,&Ba)); 1295 } else { 1296 PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1297 PetscCall(MatSeqAIJRestoreKokkosView(B,&Ba)); 1298 } 1299 PetscFunctionReturn(0); 1300 } 1301 1302 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A) 1303 { 1304 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 1305 1306 PetscFunctionBegin; 1307 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 1308 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 1309 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C", NULL)); 1310 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL)); 1311 delete (Mat_MPIAIJKokkos*)mpiaij->spptr; 1312 PetscCall(MatDestroy_MPIAIJ(A)); 1313 PetscFunctionReturn(0); 1314 } 1315 1316 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1317 { 1318 Mat B; 1319 Mat_MPIAIJ *a; 1320 1321 PetscFunctionBegin; 1322 if (reuse == MAT_INITIAL_MATRIX) { 1323 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 1324 } else if (reuse == MAT_REUSE_MATRIX) { 1325 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 1326 } 1327 B = *newmat; 1328 1329 B->boundtocpu = PETSC_FALSE; 1330 PetscCall(PetscFree(B->defaultvectype)); 1331 PetscCall(PetscStrallocpy(VECKOKKOS,&B->defaultvectype)); 1332 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS)); 1333 1334 a = static_cast<Mat_MPIAIJ*>(A->data); 1335 if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJKOKKOS)); 1336 if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJKOKKOS)); 1337 if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQKOKKOS)); 1338 1339 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 1340 B->ops->mult = MatMult_MPIAIJKokkos; 1341 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 1342 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1343 B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos; 1344 B->ops->destroy = MatDestroy_MPIAIJKokkos; 1345 1346 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos)); 1347 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos)); 1348 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos)); 1349 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos)); 1350 PetscFunctionReturn(0); 1351 } 1352 /*MC 1353 MATSMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 1354 1355 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 1356 1357 Options Database Keys: 1358 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 1359 1360 Level: beginner 1361 1362 .seealso: MatCreateAIJKokkos(), MATSEQAIJKOKKOS 1363 M*/ 1364 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 1365 { 1366 PetscFunctionBegin; 1367 PetscCall(PetscKokkosInitializeCheck()); 1368 PetscCall(MatCreate_MPIAIJ(A)); 1369 PetscCall(MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 /*@C 1374 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1375 (the default parallel PETSc format). This matrix will ultimately pushed down 1376 to Kokkos for calculations. For good matrix 1377 assembly performance the user should preallocate the matrix storage by setting 1378 the parameter nz (or the array nnz). By setting these parameters accurately, 1379 performance during matrix assembly can be increased by more than a factor of 50. 1380 1381 Collective 1382 1383 Input Parameters: 1384 + comm - MPI communicator, set to PETSC_COMM_SELF 1385 . m - number of rows 1386 . n - number of columns 1387 . nz - number of nonzeros per row (same for all rows) 1388 - nnz - array containing the number of nonzeros in the various rows 1389 (possibly different for each row) or NULL 1390 1391 Output Parameter: 1392 . A - the matrix 1393 1394 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1395 MatXXXXSetPreallocation() paradigm instead of this routine directly. 1396 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1397 1398 Notes: 1399 If nnz is given then nz is ignored 1400 1401 The AIJ format (also called the Yale sparse matrix format or 1402 compressed row storage), is fully compatible with standard Fortran 77 1403 storage. That is, the stored row and column indices can begin at 1404 either one (as in Fortran) or zero. See the users' manual for details. 1405 1406 Specify the preallocated storage with either nz or nnz (not both). 1407 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1408 allocation. For large problems you MUST preallocate memory or you 1409 will get TERRIBLE performance, see the users' manual chapter on matrices. 1410 1411 By default, this format uses inodes (identical nodes) when possible, to 1412 improve numerical efficiency of matrix-vector products and solves. We 1413 search for consecutive rows with the same nonzero structure, thereby 1414 reusing matrix information to achieve increased efficiency. 1415 1416 Level: intermediate 1417 1418 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 1419 @*/ 1420 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 1421 { 1422 PetscMPIInt size; 1423 1424 PetscFunctionBegin; 1425 PetscCall(MatCreate(comm,A)); 1426 PetscCall(MatSetSizes(*A,m,n,M,N)); 1427 PetscCallMPI(MPI_Comm_size(comm,&size)); 1428 if (size > 1) { 1429 PetscCall(MatSetType(*A,MATMPIAIJKOKKOS)); 1430 PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 1431 } else { 1432 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1433 PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 1434 } 1435 PetscFunctionReturn(0); 1436 } 1437 1438 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 1439 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 1440 { 1441 PetscMPIInt size,rank; 1442 MPI_Comm comm; 1443 PetscSplitCSRDataStructure d_mat=NULL; 1444 1445 PetscFunctionBegin; 1446 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 1447 PetscCallMPI(MPI_Comm_size(comm,&size)); 1448 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 1449 if (size == 1) { 1450 PetscCall(MatSeqAIJKokkosGetDeviceMat(A,&d_mat)); 1451 PetscCall(MatSeqAIJKokkosModifyDevice(A)); /* Since we are going to modify matrix values on device */ 1452 } else { 1453 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1454 PetscCall(MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat)); 1455 PetscCall(MatSeqAIJKokkosModifyDevice(aij->A)); 1456 PetscCall(MatSeqAIJKokkosModifyDevice(aij->B)); 1457 PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 1458 } 1459 // act like MatSetValues because not called on host 1460 if (A->assembled) { 1461 if (A->was_assembled) { 1462 PetscCall(PetscInfo(A,"Assemble more than once already\n")); 1463 } 1464 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 1465 } else { 1466 PetscCall(PetscInfo(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled)); 1467 } 1468 if (!d_mat) { 1469 struct _n_SplitCSRMat h_mat; /* host container */ 1470 Mat_SeqAIJKokkos *aijkokA; 1471 Mat_SeqAIJ *jaca; 1472 PetscInt n = A->rmap->n, nnz; 1473 Mat Amat; 1474 PetscInt *colmap; 1475 1476 /* create and copy h_mat */ 1477 h_mat.M = A->cmap->N; // use for debug build 1478 PetscCall(PetscInfo(A,"Create device matrix in Kokkos\n")); 1479 if (size == 1) { 1480 Amat = A; 1481 jaca = (Mat_SeqAIJ*)A->data; 1482 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 1483 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 1484 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 1485 h_mat.offdiag.a = NULL; 1486 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1487 } else { 1488 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1489 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 1490 PetscInt ii; 1491 Mat_SeqAIJKokkos *aijkokB; 1492 1493 Amat = aij->A; 1494 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 1495 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 1496 jaca = (Mat_SeqAIJ*)aij->A->data; 1497 PetscCheck(!aij->B->cmap->n || aij->garray,comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 1498 PetscCheck(aij->B->rmap->n == aij->A->rmap->n,comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 1499 aij->donotstash = PETSC_TRUE; 1500 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 1501 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly 1502 PetscCall(PetscCalloc1(A->cmap->N,&colmap)); 1503 PetscCall(PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt))); 1504 for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1; 1505 // allocate B copy data 1506 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 1507 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 1508 nnz = jacb->i[n]; 1509 if (jacb->compressedrow.use) { 1510 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 1511 aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 1512 Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k); 1513 h_mat.offdiag.i = aijkokB->i_uncompressed_d.data(); 1514 } else { 1515 h_mat.offdiag.i = aijkokB->i_device_data(); 1516 } 1517 h_mat.offdiag.j = aijkokB->j_device_data(); 1518 h_mat.offdiag.a = aijkokB->a_device_data(); 1519 { 1520 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N); 1521 aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 1522 Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k); 1523 h_mat.colmap = aijkokB->colmap_d.data(); 1524 PetscCall(PetscFree(colmap)); 1525 } 1526 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 1527 h_mat.offdiag.n = n; 1528 } 1529 // allocate A copy data 1530 nnz = jaca->i[n]; 1531 h_mat.diag.n = n; 1532 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 1533 PetscCallMPI(MPI_Comm_rank(comm,&h_mat.rank)); 1534 PetscCheckFalse(jaca->compressedrow.use,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 1535 else { 1536 h_mat.diag.i = aijkokA->i_device_data(); 1537 } 1538 h_mat.diag.j = aijkokA->j_device_data(); 1539 h_mat.diag.a = aijkokA->a_device_data(); 1540 // copy pointers and metdata to device 1541 PetscCall(MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat)); 1542 PetscCall(MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat)); 1543 PetscCall(PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz)); 1544 } 1545 *B = d_mat; // return it, set it in Mat, and set it up 1546 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 1547 PetscFunctionReturn(0); 1548 } 1549 1550 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask) 1551 { 1552 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1553 1554 PetscFunctionBegin; 1555 if (!aijkok) *mask = "AIJKOK_UNALLOCATED"; 1556 else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU"; 1557 else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU"; 1558 else *mask = "PETSC_OFFLOAD_BOTH"; 1559 PetscFunctionReturn(0); 1560 } 1561 1562 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A) 1563 { 1564 PetscMPIInt size; 1565 Mat Ad,Ao; 1566 const char *amask,*bmask; 1567 1568 PetscFunctionBegin; 1569 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1570 1571 if (size == 1) { 1572 PetscCall(MatSeqAIJKokkosGetOffloadMask(A,&amask)); 1573 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%s\n",amask)); 1574 } else { 1575 Ad = ((Mat_MPIAIJ*)A->data)->A; 1576 Ao = ((Mat_MPIAIJ*)A->data)->B; 1577 PetscCall(MatSeqAIJKokkosGetOffloadMask(Ad,&amask)); 1578 PetscCall(MatSeqAIJKokkosGetOffloadMask(Ao,&bmask)); 1579 PetscCall(PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask)); 1580 } 1581 PetscFunctionReturn(0); 1582 } 1583