1 #include <petscvec_kokkos.hpp> 2 #include <petscsf.h> 3 #include <petsc/private/sfimpl.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> 5 #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp> 6 #include <KokkosSparse_spadd.hpp> 7 8 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 9 { 10 PetscErrorCode ierr; 11 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 12 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 13 14 PetscFunctionBegin; 15 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 16 if (aijkok && aijkok->device_mat_d.data()) { 17 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 18 } 19 20 PetscFunctionReturn(0); 21 } 22 23 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 24 { 25 PetscErrorCode ierr; 26 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 27 28 PetscFunctionBegin; 29 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 30 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 31 #if defined(PETSC_USE_DEBUG) 32 if (d_nnz) { 33 PetscInt i; 34 for (i=0; i<mat->rmap->n; i++) { 35 PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 36 } 37 } 38 if (o_nnz) { 39 PetscInt i; 40 for (i=0; i<mat->rmap->n; i++) { 41 PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 42 } 43 } 44 #endif 45 #if defined(PETSC_USE_CTABLE) 46 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 47 #else 48 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 49 #endif 50 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 51 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 52 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 53 /* Because the B will have been resized we simply destroy it and create a new one each time */ 54 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 55 56 if (!mpiaij->A) { 57 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 58 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 59 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 60 } 61 if (!mpiaij->B) { 62 PetscMPIInt size; 63 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr); 64 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 65 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 66 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 67 } 68 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 69 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 70 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 71 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 72 mat->preallocated = PETSC_TRUE; 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 77 { 78 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 79 PetscErrorCode ierr; 80 PetscInt nt; 81 82 PetscFunctionBegin; 83 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 84 PetscCheckFalse(nt != mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 85 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 86 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 87 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 88 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 93 { 94 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 PetscCheckFalse(nt != mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 101 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 102 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 103 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 104 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 105 PetscFunctionReturn(0); 106 } 107 108 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 109 { 110 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 111 PetscErrorCode ierr; 112 PetscInt nt; 113 114 PetscFunctionBegin; 115 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 116 PetscCheckFalse(nt != mat->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt); 117 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 118 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 119 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 120 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 121 PetscFunctionReturn(0); 122 } 123 124 /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS. 125 A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n). 126 C still uses local column ids. Their corresponding global column ids are returned in glob. 127 */ 128 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C) 129 { 130 Mat Ad,Ao; 131 const PetscInt *cmap; 132 PetscErrorCode ierr; 133 134 PetscFunctionBegin; 135 ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr); 136 ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr); 137 if (glob) { 138 PetscInt cst, i, dn, on, *gidx; 139 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 140 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 141 ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr); 142 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 143 for (i=0; i<dn; i++) gidx[i] = cst + i; 144 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 145 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 146 } 147 PetscFunctionReturn(0); 148 } 149 150 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */ 151 struct MatMatStruct { 152 MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */ 153 PetscSF sf; /* SF to send/recv matrix entries */ 154 MatScalarKokkosView abuf; /* buf of mat values in send/recv */ 155 Mat C1,C2,B_local; 156 KokkosCsrMatrix C1_global,C2_global,C_global; 157 KernelHandle kh; 158 MatMatStruct() { 159 C1 = C2 = B_local = NULL; 160 sf = NULL; 161 } 162 163 ~MatMatStruct() { 164 MatDestroy(&C1); 165 MatDestroy(&C2); 166 MatDestroy(&B_local); 167 PetscSFDestroy(&sf); 168 kh.destroy_spadd_handle(); 169 } 170 }; 171 172 struct MatMatStruct_AB : public MatMatStruct { 173 MatColIdxKokkosView rows; 174 MatRowMapKokkosView rowoffset; 175 Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */ 176 177 MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){} 178 ~MatMatStruct_AB() { 179 MatDestroy(&B_other); 180 MatDestroy(&C_petsc); 181 } 182 }; 183 184 struct MatMatStruct_AtB : public MatMatStruct { 185 MatRowMapKokkosView srcrowoffset,dstrowoffset; 186 }; 187 188 struct MatProductData_MPIAIJKokkos 189 { 190 MatMatStruct_AB *mmAB; 191 MatMatStruct_AtB *mmAtB; 192 PetscBool reusesym; 193 194 MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){} 195 ~MatProductData_MPIAIJKokkos() { 196 delete mmAB; 197 delete mmAtB; 198 } 199 }; 200 201 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data) 202 { 203 PetscFunctionBegin; 204 CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data)); 205 PetscFunctionReturn(0); 206 } 207 208 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix 209 210 Input Parameters: 211 + A - the MATSEQAIJKOKKOS matrix 212 . N - new column size for the returned Kokkos matrix 213 - l2g - a map that maps old col ids to new col ids 214 215 Output Parameters: 216 . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A. 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat) 219 { 220 KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat; 221 MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */ 222 223 PetscFunctionBegin; 224 CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));})); 225 CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg)); 226 PetscFunctionReturn(0); 227 } 228 229 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix. 230 It is similar to MatCreateMPIAIJWithSplitArrays. 231 232 Input Parameters: 233 + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set 234 . A - the diag matrix using local col ids 235 - B - the offdiag matrix using global col ids 236 237 Output Parameters: 238 . mat - the updated MATMPIAIJKOKKOS matrix 239 */ 240 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B) 241 { 242 PetscErrorCode ierr; 243 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 244 PetscInt m,n,M,N,Am,An,Bm,Bn; 245 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 246 247 PetscFunctionBegin; 248 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 249 ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr); 250 ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr); 251 ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr); 252 253 PetscCheckFalse(m != Am || m != Bm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match"); 254 PetscCheckFalse(n != An,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match"); 255 PetscCheckFalse(N != Bn,PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match"); 256 PetscCheckFalse(mpiaij->A || mpiaij->B,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty"); 257 mpiaij->A = A; 258 mpiaij->B = B; 259 260 mat->preallocated = PETSC_TRUE; 261 mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */ 262 263 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr); 264 ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 265 /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and 266 also gets mpiaij->B compacted, with its col ids and size reduced 267 */ 268 ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 269 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr); 270 ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 271 272 /* Update bkok with new local col ids (stored on host) and size */ 273 bkok->j_dual.modify_host(); 274 bkok->j_dual.sync_device(); 275 bkok->SetColSize(mpiaij->B->cmap->n); 276 PetscFunctionReturn(0); 277 } 278 279 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C). 280 281 It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF. 282 In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves. 283 Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k 284 to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph). 285 286 Collective on comm of ownerSF 287 288 Input Parameters: 289 + B - the SEQAIJKOKKOS matrix, using local col ids 290 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 291 . N - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX) 292 . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX) 293 . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX) 294 295 Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX) 296 + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros. 297 . abuf - buffer for sending matrix values 298 . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[]. 299 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors. 300 . rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[] 301 - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids. 302 */ 303 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 304 PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows, 305 MatRowMapKokkosView& rowoffset,Mat& C) 306 { 307 PetscErrorCode ierr; 308 Mat_SeqAIJKokkos *bkok,*ckok; 309 310 PetscFunctionBegin; 311 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */ 312 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 313 314 if (reuse == MAT_REUSE_MATRIX) { 315 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 316 317 const auto& Ba = bkok->a_dual.view_device(); 318 const auto& Bi = bkok->i_dual.view_device(); 319 const auto& Ca = ckok->a_dual.view_device(); 320 321 /* Copy Ba to abuf */ 322 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 323 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 324 PetscInt r = rows(i); 325 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 326 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 327 abuf(base+k) = Ba(Bi(r)+k); 328 }); 329 }); 330 331 /* Send abuf to Ca through bcastSF and then mark C is updated on device */ 332 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */ 333 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); 334 ckok->a_dual.modify_device(); 335 } else if (reuse == MAT_INITIAL_MATRIX) { 336 MPI_Comm comm; 337 PetscMPIInt tag; 338 PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves; 339 340 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr); 341 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 342 Cm = nleaves; /* row size of C */ 343 Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */ 344 345 /* Get row lens (nz) of B's rows for later fast query */ 346 PetscInt *Browlens; 347 const PetscInt *tmp = bkok->i_host_data(); 348 ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr); 349 for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k]; 350 351 /* By ownerSF, each proc gets lens of rows of C */ 352 MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */ 353 Ci_h = Ci.view_host().data(); 354 Ci_h[0] = 0; 355 ierr = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 356 ierr = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 357 for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */ 358 Cnnz = Ci_h[Cm]; 359 Ci.modify_host(); 360 Ci.sync_device(); 361 362 /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */ 363 MatColIdxKokkosDualView Cj("j",Cnnz); 364 MatScalarKokkosDualView Ca("a",Cnnz); 365 366 /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */ 367 const PetscMPIInt *iranks,*ranks; 368 const PetscInt *ioffset,*irootloc,*roffset; 369 PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr; 370 MPI_Request *reqs; 371 372 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */ 373 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */ 374 375 /* figure out offsets at the send buffer, to build the SF 376 sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver. 377 rowptr[] - stores offsets for data of each row in abuf 378 379 rdisp[] - to receive sdisp[] 380 */ 381 ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr); 382 MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */ 383 rowptr = rowptr_h.data(); 384 385 sdisp[0] = 0; 386 rowptr[0] = 0; 387 for (i=0; i<niranks; i++) { /* for each receiver */ 388 PetscInt len, nz = 0; 389 for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */ 390 len = Browlens[irootloc[j]]; 391 rowptr[j+1] = rowptr[j] + len; 392 nz += len; 393 } 394 sdisp[i+1] = sdisp[i] + nz; 395 } 396 ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr); 397 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 398 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 399 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 400 401 PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */ 402 PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */ 403 PetscSFNode *iremote; 404 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); 405 for (i=0; i<nranks; i++) { /* for each sender */ 406 k = 0; 407 for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) { 408 iremote[j].rank = ranks[i]; 409 iremote[j].index = rdisp[i] + k; 410 k++; 411 } 412 } 413 /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */ 414 ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr); 415 ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 416 417 /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted 418 from local to global. Then use bcastSF to fill Ca, Cj. 419 */ 420 ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */ 421 MatColIdxKokkosView rows("rows",ioffset[niranks]); 422 Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */ 423 424 rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */ 425 426 MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */ 427 abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */ 428 429 const auto& Ba = bkok->a_dual.view_device(); 430 const auto& Bi = bkok->i_dual.view_device(); 431 const auto& Bj = bkok->j_dual.view_device(); 432 433 /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */ 434 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 435 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 436 PetscInt r = rows(i); 437 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 438 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 439 abuf(base+k) = Ba(Bi(r)+k); 440 jbuf(base+k) = l2g(Bj(Bi(r)+k)); 441 }); 442 }); 443 444 /* Send abuf & jbuf to fill Ca, Cj */ 445 ierr = PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 446 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 447 ierr = PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 448 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 449 Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */ 450 Cj.sync_host(); 451 Ca.modify_device(); 452 453 /* Construct C with Ca, Ci, Cj */ 454 auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca); 455 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr); 456 ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr); 457 ierr = PetscFree(Browlens);CHKERRQ(ierr); 458 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 459 PetscFunctionReturn(0); 460 } 461 462 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C) 463 464 It is the reverse of MatSeqAIJKokkosBcast in some sense. 465 466 Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves. 467 In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might 468 contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF. 469 470 Input Parameters: 471 + A - the SEQAIJKOKKOS matrix to be reduced 472 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 473 . local - true if A uses local col ids; false if A is already in global col ids. 474 . N - if local, N is A's global col size 475 . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N). 476 - ownerSF - the SF specifies ownership (root) of rows in A 477 478 Output Parameters: 479 + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side 480 . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows. 481 . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len. 482 . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to 483 unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset. 484 - C - the matrix made up by rows sent to me from other ranks, using global col ids 485 486 TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX. 487 */ 488 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 489 PetscSF& reduceSF,MatScalarKokkosView& abuf, 490 MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset, 491 KokkosCsrMatrix& C) 492 { 493 PetscErrorCode ierr; 494 PetscInt i,r,Am,An,Annz,Cnnz,nrows; 495 const PetscInt *Ai; 496 Mat_SeqAIJKokkos *akok; 497 498 PetscFunctionBegin; 499 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */ 500 ierr = MatGetSize(A,&Am,&An); 501 Ai = static_cast<Mat_SeqAIJ*>(A->data)->i; 502 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 503 Annz = Ai[Am]; 504 505 if (reuse == MAT_REUSE_MATRIX) { 506 /* Send Aa to abuf */ 507 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 508 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 509 510 /* Copy abuf to Ca */ 511 const MatScalarKokkosView& Ca = C.values; 512 nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */ 513 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 514 PetscInt i = t.league_rank(); 515 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 516 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 517 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);}); 518 }); 519 } else if (reuse == MAT_INITIAL_MATRIX) { 520 MPI_Comm comm; 521 MPI_Request *reqs; 522 PetscMPIInt tag; 523 PetscInt Cm; 524 525 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr); 526 ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr); 527 528 PetscInt niranks,nranks,nroots,nleaves; 529 const PetscMPIInt *iranks,*ranks; 530 const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */ 531 ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr); 532 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */ 533 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */ 534 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 535 PetscCheckFalse(nleaves != Am,PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am); 536 Cm = nroots; 537 nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */ 538 539 /* Tell owners how long each row I will send */ 540 PetscInt *srowlens; /* send buf of row lens */ 541 MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */ 542 PetscInt *rrowlens = rrowlens_h.data(); 543 544 ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr); 545 for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i]; 546 rrowlens[0] = 0; 547 rrowlens++; /* shift the pointer to make the following expression more readable */ 548 for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 549 for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);} 550 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 551 552 /* Owner builds Ci on host by histogramming rrowlens[] */ 553 MatRowMapKokkosViewHost Ci_h("i",Cm+1); 554 Kokkos::deep_copy(Ci_h,0); /* Zero Ci */ 555 MatRowMapType *Ci_ptr = Ci_h.data(); 556 557 for (i=0; i<nrows; i++) { 558 r = rows[i]; /* local row id of i-th received row */ 559 #if defined(PETSC_USE_DEBUG) 560 PetscCheckFalse(r<0 || r>=Cm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm); 561 #endif 562 Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */ 563 } 564 for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */ 565 Cnnz = Ci_ptr[Cm]; 566 567 /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */ 568 MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows); 569 PetscInt *dstrowoffset_hptr = dstrowoffset_h.data(); 570 PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */ 571 572 ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */ 573 for (i=0; i<nrows; i++) { /* for each row I receive */ 574 r = rows[i]; /* row id in C */ 575 dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */ 576 currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */ 577 } 578 ierr = PetscFree(currowlens);CHKERRQ(ierr); 579 580 rrowlens--; 581 for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */ 582 dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h); 583 srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */ 584 585 /* Build the reduceSF, which performs buffer to buffer send/recv */ 586 PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */ 587 ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr); 588 for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]]; 589 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 590 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 591 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 592 593 /* Nonzeros in abuf/jbuf are roots and those in A are leaves */ 594 PetscInt nroots2 = Cnnz,nleaves2 = Annz; 595 PetscSFNode *iremote; 596 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */ 597 for (i=0; i<nranks; i++) { 598 PetscInt rootbase = rdisp[i]; /* root offset at this root rank */ 599 PetscInt leafbase = Ai[roffset[i]]; /* leaf base */ 600 PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */ 601 for (PetscInt k=0; k<nz; k++) { 602 iremote[leafbase+k].rank = ranks[i]; 603 iremote[leafbase+k].index = rootbase + k; 604 } 605 } 606 ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr); 607 ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 608 ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr); 609 610 /* Reduce Aa, Ajg to abuf and jbuf */ 611 612 /* If A uses local col ids, convert them to global ones before sending */ 613 MatColIdxKokkosView Ajg; 614 if (local) { 615 Ajg = MatColIdxKokkosView("j",Annz); 616 const MatColIdxKokkosView& Aj = akok->j_dual.view_device(); 617 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));}); 618 } else { 619 Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */ 620 } 621 622 MatColIdxKokkosView jbuf("jbuf",Cnnz); 623 abuf = MatScalarKokkosView("abuf",Cnnz); 624 ierr = PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 625 ierr = PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 626 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 627 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 628 629 /* Copy data from abuf, jbuf to Ca, Cj */ 630 MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */ 631 MatColIdxKokkosView Cj("j",Cnnz); 632 MatScalarKokkosView Ca("a",Cnnz); 633 634 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 635 PetscInt i = t.league_rank(); 636 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 637 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 638 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) { 639 Ca(dst+k) = abuf(src+k); 640 Cj(dst+k) = jbuf(src+k); 641 }); 642 }); 643 644 /* Build C with Ca, Ci, Cj */ 645 C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj); 646 ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr); 647 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 648 PetscFunctionReturn(0); 649 } 650 651 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix 652 653 Input Parameters: 654 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 655 . reuse - indicate whether the matrix has called this function before 656 . csrmat - the KokkosCsrMatrix, of size m,N 657 - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first 658 entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag 659 entry is 5, then Cdstart[i] = 3. 660 661 Output Parameters: 662 + C - the updated MATMPIAIJKOKKOS matrix 663 - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 664 665 Notes: 666 Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern 667 */ 668 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart) 669 { 670 PetscErrorCode ierr; 671 const MatScalarKokkosView& Ca = csrmat.values; 672 const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map; 673 PetscInt m,n,N; 674 675 PetscFunctionBegin; 676 ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr); 677 ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr); 678 679 if (reuse == MAT_REUSE_MATRIX) { 680 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data); 681 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr); 682 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr); 683 const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device(); 684 const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device(); 685 686 /* Fill 'a' of Cd and Co on device */ 687 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 688 PetscInt i = t.league_rank(); /* row i */ 689 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 690 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 691 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 692 PetscInt cdend = cdstart + cdlen; 693 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 694 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 695 if (k < cdstart) { /* k in [0, cdstart) */ 696 Coa(Coi(i)+k) = Ca(Ci(i)+k); 697 } else if (k < cdend) { /* k in [cdstart, cdend) */ 698 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 699 } else { /* k in [cdend, clen) */ 700 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 701 } 702 }); 703 }); 704 705 akok->a_dual.modify_device(); 706 bkok->a_dual.modify_device(); 707 } else if (reuse == MAT_INITIAL_MATRIX) { 708 Mat Cd,Co; 709 const MatColIdxKokkosView& Cj = csrmat.graph.entries; 710 MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1); 711 MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device(); 712 PetscInt cstart,cend; 713 714 /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks: 715 left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend). 716 Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend, 717 such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly 718 stores values of cdstart and cdend-cstart (aka Cdi[]) instead. 719 */ 720 Cdstart = MatRowMapKokkosView("Cdstart",m); 721 ierr = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */ 722 723 /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a 724 CUDA warp would completely diverge. So I use TeamPolicy with a team size 1. 725 */ 726 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 727 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 728 PetscInt i = t.league_rank(); /* row i */ 729 PetscInt j,first,count,step; 730 731 if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */ 732 Cdi(0) = 0; 733 Coi(0) = 0; 734 } 735 736 /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns 737 in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found. 738 */ 739 count = Ci(i+1)-Ci(i); 740 first = Ci(i); 741 while (count > 0) { 742 j = first; 743 step = count / 2; 744 j += step; 745 if (Cj(j) < cstart) { 746 first = ++j; 747 count -= step + 1; 748 } else count = step; 749 } 750 Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */ 751 752 /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */ 753 count = Ci(i+1) - first; 754 while (count > 0) { 755 j = first; 756 step = count / 2; 757 j += step; 758 if (Cj(j) < cend) { 759 first = ++j; 760 count -= step + 1; 761 } else count = step; 762 } 763 Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */ 764 Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */ 765 }); 766 }); 767 768 /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */ 769 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 770 update += Cdi(i); 771 if (final) Cdi(i) = update; 772 }); 773 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 774 update += Coi(i); 775 if (final) Coi(i) = update; 776 }); 777 778 /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in 779 MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co. 780 */ 781 Cdi_dual.modify_device(); 782 Coi_dual.modify_device(); 783 Cdi_dual.sync_host(); 784 Coi_dual.sync_host(); 785 PetscInt Cd_nnz = Cdi_dual.view_host().data()[m]; 786 PetscInt Co_nnz = Coi_dual.view_host().data()[m]; 787 788 /* With nnz, allocate a, j for Cd and Co */ 789 MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz); 790 MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz); 791 792 /* Fill a, j of Cd and Co on device */ 793 MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device(); 794 MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device(); 795 796 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 797 PetscInt i = t.league_rank(); /* row i */ 798 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 799 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 800 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 801 PetscInt cdend = cdstart + cdlen; 802 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 803 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 804 if (k < cdstart) { /* k in [0, cdstart) */ 805 Coa(Coi(i)+k) = Ca(Ci(i)+k); 806 Coj(Coi(i)+k) = Cj(Ci(i)+k); 807 } else if (k < cdend) { /* k in [cdstart, cdend) */ 808 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 809 Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */ 810 } else { /* k in [cdend, clen) */ 811 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 812 Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k); 813 } 814 }); 815 }); 816 817 Cdj_dual.modify_device(); 818 Cda_dual.modify_device(); 819 Coj_dual.modify_device(); 820 Coa_dual.modify_device(); 821 /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */ 822 auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual); 823 auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual); 824 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr); 825 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr); 826 ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */ 827 } 828 PetscFunctionReturn(0); 829 } 830 831 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids. 832 833 It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view. 834 835 Input Parameters: 836 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 837 . reuse - indicate whether the matrix has called this function before 838 . csrmat - the KokkosCsrMatrix, of size m,N 839 - Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first 840 entry of the diag block of C in csrmat's j array. 841 842 Output Parameters: 843 + C - the updated MATMPIAIJKOKKOS matrix 844 - Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 845 846 Notes: the input matrix's col ids and col size will be changed. 847 */ 848 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g) 849 { 850 PetscErrorCode ierr; 851 Mat_SeqAIJKokkos *ckok; 852 ISLocalToGlobalMapping l2gmap; 853 const PetscInt *garray; 854 PetscInt sz; 855 856 PetscFunctionBegin; 857 /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */ 858 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr); 859 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 860 ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */ 861 ckok->j_dual.sync_device(); 862 ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */ 863 864 /* Build l2g -- the local to global mapping of C's cols */ 865 ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr); 866 ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr); 867 PetscCheckFalse(C->cmap->n != sz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz); 868 869 ConstMatColIdxKokkosViewHost tmp(garray,sz); 870 l2g = MatColIdxKokkosView("l2g",sz); 871 Kokkos::deep_copy(l2g,tmp); 872 873 ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr); 874 ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr); 875 PetscFunctionReturn(0); 876 } 877 878 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos 879 880 Input Parameters: 881 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 882 . A - an MPIAIJKOKKOS matrix 883 . B - an MPIAIJKOKKOS matrix 884 - mm - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations. 885 886 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 887 */ 888 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm) 889 { 890 PetscErrorCode ierr; 891 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 892 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 893 IS glob = NULL; 894 const PetscInt *garray; 895 PetscInt N = B->cmap->N,sz; 896 ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */ 897 MatColIdxKokkosView l2g2; 898 Mat C1,C2; /* intermediate matrices */ 899 900 PetscFunctionBegin; 901 /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */ 902 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr); 903 ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr); 904 ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr); 905 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 906 C1->product->api_user = product->api_user; 907 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 908 PetscCheckFalse(!C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 909 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 910 911 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 912 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 913 const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */ 914 l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */ 915 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global); 916 917 /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */ 918 ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf, 919 mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr); 920 921 /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */ 922 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr); 923 ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr); 924 ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr); 925 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 926 C2->product->api_user = product->api_user; 927 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 928 PetscCheckFalse(!C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 929 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 930 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global); 931 932 /* C = C1 + C2. We actually use their global col ids versions in adding */ 933 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */ 934 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 935 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 936 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 937 938 mm->C1 = C1; 939 mm->C2 = C2; 940 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 941 ierr = ISDestroy(&glob);CHKERRQ(ierr); 942 PetscFunctionReturn(0); 943 } 944 945 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos 946 947 Input Parameters: 948 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 949 . A - an MPIAIJKOKKOS matrix 950 . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank. 951 . localB - Does B use local col ids? If false, then B is already in global col ids. 952 . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator. 953 . l2g - If localB, then l2g maps B's local col ids to global ones. 954 - mm - a struct used to stash intermediate data in AtB 955 956 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 957 */ 958 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm) 959 { 960 PetscErrorCode ierr; 961 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 962 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 963 Mat C1,C2; /* intermediate matrices */ 964 965 PetscFunctionBegin; 966 /* C1 = Ad^t * B */ 967 ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr); 968 ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr); 969 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 970 C1->product->api_user = product->api_user; 971 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 972 PetscCheckFalse(!C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 973 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 974 975 if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);} 976 else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */ 977 978 /* C2 = Ao^t * B */ 979 ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr); 980 ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr); 981 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 982 C2->product->api_user = product->api_user; 983 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 984 PetscCheckFalse(!C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 985 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 986 987 ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf, 988 mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr); 989 990 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */ 991 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 992 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 993 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 994 mm->C1 = C1; 995 mm->C2 = C2; 996 PetscFunctionReturn(0); 997 } 998 999 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C) 1000 { 1001 PetscErrorCode ierr; 1002 Mat_Product *product = C->product; 1003 MatProductType ptype; 1004 MatProductData_MPIAIJKokkos *mmdata; 1005 MatMatStruct *mm = NULL; 1006 MatMatStruct_AB *ab; 1007 MatMatStruct_AtB *atb; 1008 Mat A,B,Ad,Ao,Bd,Bo; 1009 const MatScalarType one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */ 1010 1011 PetscFunctionBegin; 1012 MatCheckProduct(C,1); 1013 mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data); 1014 ptype = product->type; 1015 A = product->A; 1016 B = product->B; 1017 Ad = static_cast<Mat_MPIAIJ*>(A->data)->A; 1018 Ao = static_cast<Mat_MPIAIJ*>(A->data)->B; 1019 Bd = static_cast<Mat_MPIAIJ*>(B->data)->A; 1020 Bo = static_cast<Mat_MPIAIJ*>(B->data)->B; 1021 1022 if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 1023 mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 1024 ab = mmdata->mmAB; 1025 atb = mmdata->mmAtB; 1026 if (ab) { 1027 static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE; 1028 static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE; 1029 } 1030 if (atb) { 1031 static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE; 1032 static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE; 1033 } 1034 PetscFunctionReturn(0); 1035 } 1036 1037 if (ptype == MATPRODUCT_AB) { 1038 ab = mmdata->mmAB; 1039 /* C1 = Ad * B_local */ 1040 PetscCheckFalse(!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB"); 1041 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1042 PetscCheckFalse(ab->C1->product->B != ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed"); 1043 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1044 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1045 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1046 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1047 /* C2 = Ao * B_other */ 1048 PetscCheckFalse(ab->C2->product->B != ab->B_other,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed"); 1049 if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1050 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); 1051 /* C = C1_global + C2_global */ 1052 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1053 mm = static_cast<MatMatStruct*>(ab); 1054 } else if (ptype == MATPRODUCT_AtB) { 1055 atb = mmdata->mmAtB; 1056 PetscCheckFalse(!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB"); 1057 /* C1 = Ad^t * B_local */ 1058 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr); 1059 PetscCheckFalse(atb->C1->product->B != atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed"); 1060 if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1061 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1062 1063 /* C2 = Ao^t * B_local */ 1064 PetscCheckFalse(atb->C2->product->B != atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed"); 1065 if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1066 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1067 /* Form C2_global */ 1068 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1069 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1070 /* C = C1_global + C2_global */ 1071 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1072 mm = static_cast<MatMatStruct*>(atb); 1073 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1074 ab = mmdata->mmAB; 1075 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1076 1077 /* ab->C1 = Ad * B_local */ 1078 PetscCheckFalse(ab->C1->product->B != ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed"); 1079 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1080 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1081 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1082 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1083 /* ab->C2 = Ao * B_other */ 1084 if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1085 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */ 1086 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1087 1088 /* atb->C1 = Bd^t * ab->C_petsc */ 1089 atb = mmdata->mmAtB; 1090 PetscCheckFalse(atb->C1->product->B != ab->C_petsc,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed"); 1091 if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1092 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1093 /* atb->C2 = Bo^t * ab->C_petsc */ 1094 if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1095 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1096 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1097 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1098 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1099 mm = static_cast<MatMatStruct*>(atb); 1100 } 1101 /* Split C_global to form C */ 1102 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1103 PetscFunctionReturn(0); 1104 } 1105 1106 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C) 1107 { 1108 PetscErrorCode ierr; 1109 Mat A,B; 1110 Mat_Product *product = C->product; 1111 MatProductType ptype; 1112 MatProductData_MPIAIJKokkos *mmdata; 1113 MatMatStruct *mm = NULL; 1114 IS glob = NULL; 1115 const PetscInt *garray; 1116 PetscInt m,n,M,N,sz; 1117 ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */ 1118 1119 PetscFunctionBegin; 1120 MatCheckProduct(C,1); 1121 PetscCheckFalse(product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1122 ptype = product->type; 1123 A = product->A; 1124 B = product->B; 1125 1126 switch (ptype) { 1127 case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break; 1128 case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break; 1129 case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */ 1130 default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 1131 } 1132 1133 ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr); 1134 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 1135 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 1136 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1137 1138 mmdata = new MatProductData_MPIAIJKokkos(); 1139 mmdata->reusesym = product->api_user; 1140 1141 if (ptype == MATPRODUCT_AB) { 1142 mmdata->mmAB = new MatMatStruct_AB(); 1143 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr); 1144 mm = static_cast<MatMatStruct*>(mmdata->mmAB); 1145 } else if (ptype == MATPRODUCT_AtB) { 1146 mmdata->mmAtB = new MatMatStruct_AtB(); 1147 auto atb = mmdata->mmAtB; 1148 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr); 1149 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 1150 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 1151 l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz)); 1152 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr); 1153 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 1154 ierr = ISDestroy(&glob);CHKERRQ(ierr); 1155 mm = static_cast<MatMatStruct*>(atb); 1156 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1157 mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */ 1158 mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */ 1159 auto ab = mmdata->mmAB; 1160 auto atb = mmdata->mmAtB; 1161 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr); 1162 auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */ 1163 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr); 1164 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr); 1165 mm = static_cast<MatMatStruct*>(atb); 1166 } 1167 /* Split the C_global into petsc A, B format */ 1168 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1169 C->product->data = mmdata; 1170 C->product->destroy = MatProductDataDestroy_MPIAIJKokkos; 1171 C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos; 1172 PetscFunctionReturn(0); 1173 } 1174 1175 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat) 1176 { 1177 PetscErrorCode ierr; 1178 Mat_Product *product = mat->product; 1179 PetscBool match = PETSC_FALSE; 1180 PetscBool usecpu = PETSC_FALSE; 1181 1182 PetscFunctionBegin; 1183 MatCheckProduct(mat,1); 1184 if (!product->A->boundtocpu && !product->B->boundtocpu) { 1185 ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr); 1186 } 1187 if (match) { /* we can always fallback to the CPU if requested */ 1188 switch (product->type) { 1189 case MATPRODUCT_AB: 1190 if (product->api_user) { 1191 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 1192 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1193 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1194 } else { 1195 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 1196 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1197 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1198 } 1199 break; 1200 case MATPRODUCT_AtB: 1201 if (product->api_user) { 1202 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 1203 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1204 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1205 } else { 1206 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 1207 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1208 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1209 } 1210 break; 1211 case MATPRODUCT_PtAP: 1212 if (product->api_user) { 1213 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1214 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1215 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1216 } else { 1217 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 1218 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1219 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1220 } 1221 break; 1222 default: 1223 break; 1224 } 1225 match = (PetscBool)!usecpu; 1226 } 1227 if (match) { 1228 switch (product->type) { 1229 case MATPRODUCT_AB: 1230 case MATPRODUCT_AtB: 1231 case MATPRODUCT_PtAP: 1232 mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos; 1233 break; 1234 default: 1235 break; 1236 } 1237 } 1238 /* fallback to MPIAIJ ops */ 1239 if (!mat->ops->productsymbolic) { 1240 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 1241 } 1242 PetscFunctionReturn(0); 1243 } 1244 1245 static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1246 { 1247 PetscErrorCode ierr; 1248 Mat newmat; 1249 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 1250 1251 PetscFunctionBegin; 1252 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 1253 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 1254 ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr); 1255 ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */ 1256 ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr); 1257 ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 1258 ierr = MatConvert(newmat,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1259 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); /* Not MatHeaderReplace() since we want to keep some mat's info */ 1260 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1261 mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderReplace() */ 1262 mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode) 1267 { 1268 PetscErrorCode ierr; 1269 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 1270 Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 1271 Mat A = mpiaij->A,B = mpiaij->B; 1272 PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2; 1273 MatScalarKokkosView Aa,Ba; 1274 MatScalarKokkosView v1; 1275 MatScalarKokkosView& vsend = mpikok->sendbuf_d; 1276 const MatScalarKokkosView& v2 = mpikok->recvbuf_d; 1277 const PetscCountKokkosView& Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d; 1278 const PetscCountKokkosView& Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d; 1279 const PetscCountKokkosView& Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d; 1280 const PetscCountKokkosView& Cperm1 = mpikok->Cperm1_d; 1281 PetscMemType memtype; 1282 1283 PetscFunctionBegin; 1284 PetscAssert(mat->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()"); 1285 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); /* Return PETSC_MEMTYPE_HOST when v is NULL */ 1286 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */ 1287 v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n)); 1288 } else { 1289 v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */ 1290 } 1291 1292 if (imode == INSERT_VALUES) { 1293 ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */ 1294 ierr = MatSeqAIJGetKokkosViewWrite(B,&Ba);CHKERRQ(ierr); 1295 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1296 Kokkos::deep_copy(Ba,0.0); 1297 } else { 1298 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */ 1299 ierr = MatSeqAIJGetKokkosView(B,&Ba);CHKERRQ(ierr); 1300 } 1301 1302 /* Pack entries to be sent to remote */ 1303 Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));}); 1304 1305 /* Send remote entries to their owner and overlap the communication with local computation */ 1306 ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE);CHKERRQ(ierr); 1307 /* Add local entries to A and B */ 1308 Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));}); 1309 Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));}); 1310 ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE);CHKERRQ(ierr); 1311 1312 /* Add received remote entries to A and B */ 1313 Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));}); 1314 Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));}); 1315 1316 if (imode == INSERT_VALUES) { 1317 ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* Increase A & B's state etc. */ 1318 ierr = MatSeqAIJRestoreKokkosViewWrite(B,&Ba);CHKERRQ(ierr); 1319 } else { 1320 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1321 ierr = MatSeqAIJRestoreKokkosView(B,&Ba);CHKERRQ(ierr); 1322 } 1323 PetscFunctionReturn(0); 1324 } 1325 1326 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A) 1327 { 1328 PetscErrorCode ierr; 1329 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 1330 1331 PetscFunctionBegin; 1332 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 1333 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 1334 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C", NULL);CHKERRQ(ierr); 1335 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 1336 delete (Mat_MPIAIJKokkos*)mpiaij->spptr; 1337 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 1338 PetscFunctionReturn(0); 1339 } 1340 1341 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1342 { 1343 PetscErrorCode ierr; 1344 Mat B; 1345 Mat_MPIAIJ *a; 1346 1347 PetscFunctionBegin; 1348 if (reuse == MAT_INITIAL_MATRIX) { 1349 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 1350 } else if (reuse == MAT_REUSE_MATRIX) { 1351 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 1352 } 1353 B = *newmat; 1354 1355 B->boundtocpu = PETSC_FALSE; 1356 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1357 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 1358 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1359 1360 a = static_cast<Mat_MPIAIJ*>(A->data); 1361 if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1362 if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1363 if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);} 1364 1365 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 1366 B->ops->mult = MatMult_MPIAIJKokkos; 1367 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 1368 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1369 B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos; 1370 B->ops->destroy = MatDestroy_MPIAIJKokkos; 1371 1372 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 1373 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr); 1374 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);CHKERRQ(ierr); 1375 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);CHKERRQ(ierr); 1376 PetscFunctionReturn(0); 1377 } 1378 /*MC 1379 MATSMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 1380 1381 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 1382 1383 Options Database Keys: 1384 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 1385 1386 Level: beginner 1387 1388 .seealso: MatCreateAIJKokkos(), MATSEQAIJKOKKOS 1389 M*/ 1390 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 1391 { 1392 PetscErrorCode ierr; 1393 1394 PetscFunctionBegin; 1395 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1396 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 1397 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 1398 PetscFunctionReturn(0); 1399 } 1400 1401 /*@C 1402 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1403 (the default parallel PETSc format). This matrix will ultimately pushed down 1404 to Kokkos for calculations. For good matrix 1405 assembly performance the user should preallocate the matrix storage by setting 1406 the parameter nz (or the array nnz). By setting these parameters accurately, 1407 performance during matrix assembly can be increased by more than a factor of 50. 1408 1409 Collective 1410 1411 Input Parameters: 1412 + comm - MPI communicator, set to PETSC_COMM_SELF 1413 . m - number of rows 1414 . n - number of columns 1415 . nz - number of nonzeros per row (same for all rows) 1416 - nnz - array containing the number of nonzeros in the various rows 1417 (possibly different for each row) or NULL 1418 1419 Output Parameter: 1420 . A - the matrix 1421 1422 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1423 MatXXXXSetPreallocation() paradigm instead of this routine directly. 1424 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1425 1426 Notes: 1427 If nnz is given then nz is ignored 1428 1429 The AIJ format (also called the Yale sparse matrix format or 1430 compressed row storage), is fully compatible with standard Fortran 77 1431 storage. That is, the stored row and column indices can begin at 1432 either one (as in Fortran) or zero. See the users' manual for details. 1433 1434 Specify the preallocated storage with either nz or nnz (not both). 1435 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1436 allocation. For large problems you MUST preallocate memory or you 1437 will get TERRIBLE performance, see the users' manual chapter on matrices. 1438 1439 By default, this format uses inodes (identical nodes) when possible, to 1440 improve numerical efficiency of matrix-vector products and solves. We 1441 search for consecutive rows with the same nonzero structure, thereby 1442 reusing matrix information to achieve increased efficiency. 1443 1444 Level: intermediate 1445 1446 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 1447 @*/ 1448 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 1449 { 1450 PetscErrorCode ierr; 1451 PetscMPIInt size; 1452 1453 PetscFunctionBegin; 1454 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1455 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 1456 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1457 if (size > 1) { 1458 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1459 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 1460 } else { 1461 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1462 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 1463 } 1464 PetscFunctionReturn(0); 1465 } 1466 1467 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 1468 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 1469 { 1470 PetscMPIInt size,rank; 1471 MPI_Comm comm; 1472 PetscErrorCode ierr; 1473 PetscSplitCSRDataStructure d_mat=NULL; 1474 1475 PetscFunctionBegin; 1476 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1477 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1478 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 1479 if (size == 1) { 1480 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 1481 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */ 1482 } else { 1483 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1484 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 1485 ierr = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr); 1486 ierr = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr); 1487 PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 1488 } 1489 // act like MatSetValues because not called on host 1490 if (A->assembled) { 1491 if (A->was_assembled) { 1492 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 1493 } 1494 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 1495 } else { 1496 ierr = PetscInfo(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr); 1497 } 1498 if (!d_mat) { 1499 struct _n_SplitCSRMat h_mat; /* host container */ 1500 Mat_SeqAIJKokkos *aijkokA; 1501 Mat_SeqAIJ *jaca; 1502 PetscInt n = A->rmap->n, nnz; 1503 Mat Amat; 1504 PetscInt *colmap; 1505 1506 /* create and copy h_mat */ 1507 h_mat.M = A->cmap->N; // use for debug build 1508 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 1509 if (size == 1) { 1510 Amat = A; 1511 jaca = (Mat_SeqAIJ*)A->data; 1512 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 1513 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 1514 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 1515 h_mat.offdiag.a = NULL; 1516 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1517 } else { 1518 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1519 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 1520 PetscInt ii; 1521 Mat_SeqAIJKokkos *aijkokB; 1522 1523 Amat = aij->A; 1524 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 1525 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 1526 jaca = (Mat_SeqAIJ*)aij->A->data; 1527 PetscCheckFalse(aij->B->cmap->n && !aij->garray,comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 1528 PetscCheckFalse(aij->B->rmap->n != aij->A->rmap->n,comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 1529 aij->donotstash = PETSC_TRUE; 1530 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 1531 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly 1532 ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr); 1533 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr); 1534 for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1; 1535 // allocate B copy data 1536 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 1537 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 1538 nnz = jacb->i[n]; 1539 if (jacb->compressedrow.use) { 1540 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 1541 aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 1542 Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k); 1543 h_mat.offdiag.i = aijkokB->i_uncompressed_d.data(); 1544 } else { 1545 h_mat.offdiag.i = aijkokB->i_device_data(); 1546 } 1547 h_mat.offdiag.j = aijkokB->j_device_data(); 1548 h_mat.offdiag.a = aijkokB->a_device_data(); 1549 { 1550 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N); 1551 aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 1552 Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k); 1553 h_mat.colmap = aijkokB->colmap_d.data(); 1554 ierr = PetscFree(colmap);CHKERRQ(ierr); 1555 } 1556 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 1557 h_mat.offdiag.n = n; 1558 } 1559 // allocate A copy data 1560 nnz = jaca->i[n]; 1561 h_mat.diag.n = n; 1562 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 1563 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 1564 PetscCheckFalse(jaca->compressedrow.use,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 1565 else { 1566 h_mat.diag.i = aijkokA->i_device_data(); 1567 } 1568 h_mat.diag.j = aijkokA->j_device_data(); 1569 h_mat.diag.a = aijkokA->a_device_data(); 1570 // copy pointers and metdata to device 1571 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 1572 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 1573 ierr = PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 1574 } 1575 *B = d_mat; // return it, set it in Mat, and set it up 1576 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 1577 PetscFunctionReturn(0); 1578 } 1579 1580 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask) 1581 { 1582 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1583 1584 PetscFunctionBegin; 1585 if (!aijkok) *mask = "AIJKOK_UNALLOCATED"; 1586 else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU"; 1587 else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU"; 1588 else *mask = "PETSC_OFFLOAD_BOTH"; 1589 PetscFunctionReturn(0); 1590 } 1591 1592 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A) 1593 { 1594 PetscErrorCode ierr; 1595 PetscMPIInt size; 1596 Mat Ad,Ao; 1597 const char *amask,*bmask; 1598 1599 PetscFunctionBegin; 1600 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 1601 1602 if (size == 1) { 1603 ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr); 1604 ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr); 1605 } else { 1606 Ad = ((Mat_MPIAIJ*)A->data)->A; 1607 Ao = ((Mat_MPIAIJ*)A->data)->B; 1608 ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr); 1609 ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr); 1610 ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr); 1611 } 1612 PetscFunctionReturn(0); 1613 } 1614