1 #include <petscvec_kokkos.hpp> 2 #include <petscsf.h> 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> 4 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 5 #include <KokkosSparse_spadd.hpp> 6 7 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 8 { 9 PetscErrorCode ierr; 10 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 11 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 12 13 PetscFunctionBegin; 14 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 15 if (aijkok && aijkok->device_mat_d.data()) { 16 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 17 } 18 19 PetscFunctionReturn(0); 20 } 21 22 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 23 { 24 PetscErrorCode ierr; 25 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 26 27 PetscFunctionBegin; 28 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 29 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 30 #if defined(PETSC_USE_DEBUG) 31 if (d_nnz) { 32 PetscInt i; 33 for (i=0; i<mat->rmap->n; i++) { 34 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 35 } 36 } 37 if (o_nnz) { 38 PetscInt i; 39 for (i=0; i<mat->rmap->n; i++) { 40 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 41 } 42 } 43 #endif 44 #if defined(PETSC_USE_CTABLE) 45 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 46 #else 47 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 48 #endif 49 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 50 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 51 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 52 /* Because the B will have been resized we simply destroy it and create a new one each time */ 53 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 54 55 if (!mpiaij->A) { 56 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 57 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 58 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 59 } 60 if (!mpiaij->B) { 61 PetscMPIInt size; 62 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr); 63 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 64 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 65 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 66 } 67 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 68 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 69 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 70 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 71 mat->preallocated = PETSC_TRUE; 72 PetscFunctionReturn(0); 73 } 74 75 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 76 { 77 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 78 PetscErrorCode ierr; 79 PetscInt nt; 80 81 PetscFunctionBegin; 82 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 83 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 84 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 85 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 86 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 88 PetscFunctionReturn(0); 89 } 90 91 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 92 { 93 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 94 PetscErrorCode ierr; 95 PetscInt nt; 96 97 PetscFunctionBegin; 98 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 99 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 100 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 101 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 102 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 104 PetscFunctionReturn(0); 105 } 106 107 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 108 { 109 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 110 PetscErrorCode ierr; 111 PetscInt nt; 112 113 PetscFunctionBegin; 114 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 115 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt); 116 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 117 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 118 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 119 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 120 PetscFunctionReturn(0); 121 } 122 123 /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS. 124 A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n). 125 C still uses local column ids. Their corresponding global column ids are returned in glob. 126 */ 127 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C) 128 { 129 Mat Ad,Ao; 130 const PetscInt *cmap; 131 PetscErrorCode ierr; 132 133 PetscFunctionBegin; 134 ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr); 135 ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr); 136 if (glob) { 137 PetscInt cst, i, dn, on, *gidx; 138 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 139 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 140 ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr); 141 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 142 for (i=0; i<dn; i++) gidx[i] = cst + i; 143 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 144 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 145 } 146 PetscFunctionReturn(0); 147 } 148 149 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */ 150 struct MatMatStruct { 151 MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */ 152 PetscSF sf; /* SF to send/recv matrix entries */ 153 MatScalarKokkosView abuf; /* buf of mat values in send/recv */ 154 Mat C1,C2,B_local; 155 KokkosCsrMatrix C1_global,C2_global,C_global; 156 KernelHandle kh; 157 MatMatStruct() { 158 C1 = C2 = B_local = NULL; 159 sf = NULL; 160 } 161 162 ~MatMatStruct() { 163 MatDestroy(&C1); 164 MatDestroy(&C2); 165 MatDestroy(&B_local); 166 PetscSFDestroy(&sf); 167 kh.destroy_spadd_handle(); 168 } 169 }; 170 171 struct MatMatStruct_AB : public MatMatStruct { 172 MatColIdxKokkosView rows; 173 MatRowMapKokkosView rowoffset; 174 Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */ 175 176 MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){} 177 ~MatMatStruct_AB() { 178 MatDestroy(&B_other); 179 MatDestroy(&C_petsc); 180 } 181 }; 182 183 struct MatMatStruct_AtB : public MatMatStruct { 184 MatRowMapKokkosView srcrowoffset,dstrowoffset; 185 }; 186 187 struct MatProductData_MPIAIJKokkos 188 { 189 MatMatStruct_AB *mmAB; 190 MatMatStruct_AtB *mmAtB; 191 PetscBool reusesym; 192 193 MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){} 194 ~MatProductData_MPIAIJKokkos() { 195 delete mmAB; 196 delete mmAtB; 197 } 198 }; 199 200 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data) 201 { 202 PetscFunctionBegin; 203 CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data)); 204 PetscFunctionReturn(0); 205 } 206 207 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix 208 209 Input Parameters: 210 + A - the MATSEQAIJKOKKOS matrix 211 . N - new column size for the returned Kokkos matrix 212 - l2g - a map that maps old col ids to new col ids 213 214 Output Parameters: 215 . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A. 216 */ 217 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat) 218 { 219 KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat; 220 MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */ 221 222 PetscFunctionBegin; 223 CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));})); 224 CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg)); 225 PetscFunctionReturn(0); 226 } 227 228 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix. 229 It is similar to MatCreateMPIAIJWithSplitArrays. 230 231 Input Parameters: 232 + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set 233 . A - the diag matrix using local col ids 234 - B - the offdiag matrix using global col ids 235 236 Output Parameters: 237 . mat - the updated MATMPIAIJKOKKOS matrix 238 */ 239 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B) 240 { 241 PetscErrorCode ierr; 242 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 243 PetscInt m,n,M,N,Am,An,Bm,Bn; 244 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 245 246 PetscFunctionBegin; 247 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 248 ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr); 249 ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr); 250 ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr); 251 252 if (m != Am || m != Bm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match"); 253 if (n != An) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match"); 254 if (N != Bn) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match"); 255 if (mpiaij->A || mpiaij->B) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty"); 256 mpiaij->A = A; 257 mpiaij->B = B; 258 259 mat->preallocated = PETSC_TRUE; 260 mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */ 261 262 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr); 263 ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 264 /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and 265 also gets mpiaij->B compacted, with its col ids and size reduced 266 */ 267 ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 268 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr); 269 ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 270 271 /* Update bkok with new local col ids (stored on host) and size */ 272 bkok->j_dual.modify_host(); 273 bkok->j_dual.sync_device(); 274 bkok->SetColSize(mpiaij->B->cmap->n); 275 PetscFunctionReturn(0); 276 } 277 278 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C). 279 280 It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF. 281 In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves. 282 Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k 283 to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph). 284 285 Collective on comm of ownerSF 286 287 Input Parameters: 288 + B - the SEQAIJKOKKOS matrix, using local col ids 289 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 290 . N - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX) 291 . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX) 292 . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX) 293 294 Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX) 295 + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros. 296 . abuf - buffer for sending matrix values 297 . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[]. 298 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors. 299 . rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[] 300 - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids. 301 */ 302 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 303 PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows, 304 MatRowMapKokkosView& rowoffset,Mat& C) 305 { 306 PetscErrorCode ierr; 307 Mat_SeqAIJKokkos *bkok,*ckok; 308 309 PetscFunctionBegin; 310 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */ 311 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 312 313 if (reuse == MAT_REUSE_MATRIX) { 314 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 315 316 const auto& Ba = bkok->a_dual.view_device(); 317 const auto& Bi = bkok->i_dual.view_device(); 318 const auto& Ca = ckok->a_dual.view_device(); 319 320 /* Copy Ba to abuf */ 321 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 322 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 323 PetscInt r = rows(i); 324 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 325 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 326 abuf(base+k) = Ba(Bi(r)+k); 327 }); 328 }); 329 330 /* Send abuf to Ca through bcastSF and then mark C is updated on device */ 331 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */ 332 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); 333 ckok->a_dual.modify_device(); 334 } else if (reuse == MAT_INITIAL_MATRIX) { 335 MPI_Comm comm; 336 PetscMPIInt tag; 337 PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves; 338 339 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr); 340 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 341 Cm = nleaves; /* row size of C */ 342 Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */ 343 344 /* Get row lens (nz) of B's rows for later fast query */ 345 PetscInt *Browlens; 346 const PetscInt *tmp = bkok->i_host_data(); 347 ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr); 348 for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k]; 349 350 /* By ownerSF, each proc gets lens of rows of C */ 351 MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */ 352 Ci_h = Ci.view_host().data(); 353 Ci_h[0] = 0; 354 ierr = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 355 ierr = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 356 for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */ 357 Cnnz = Ci_h[Cm]; 358 Ci.modify_host(); 359 Ci.sync_device(); 360 361 /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */ 362 MatColIdxKokkosDualView Cj("j",Cnnz); 363 MatScalarKokkosDualView Ca("a",Cnnz); 364 365 /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */ 366 const PetscMPIInt *iranks,*ranks; 367 const PetscInt *ioffset,*irootloc,*roffset; 368 PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr; 369 MPI_Request *reqs; 370 371 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */ 372 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */ 373 374 /* figure out offsets at the send buffer, to build the SF 375 sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver. 376 rowptr[] - stores offsets for data of each row in abuf 377 378 rdisp[] - to receive sdisp[] 379 */ 380 ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr); 381 MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */ 382 rowptr = rowptr_h.data(); 383 384 sdisp[0] = 0; 385 rowptr[0] = 0; 386 for (i=0; i<niranks; i++) { /* for each receiver */ 387 PetscInt len, nz = 0; 388 for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */ 389 len = Browlens[irootloc[j]]; 390 rowptr[j+1] = rowptr[j] + len; 391 nz += len; 392 } 393 sdisp[i+1] = sdisp[i] + nz; 394 } 395 ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr); 396 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 397 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 398 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 399 400 PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */ 401 PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */ 402 PetscSFNode *iremote; 403 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); 404 for (i=0; i<nranks; i++) { /* for each sender */ 405 k = 0; 406 for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) { 407 iremote[j].rank = ranks[i]; 408 iremote[j].index = rdisp[i] + k; 409 k++; 410 } 411 } 412 /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */ 413 ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr); 414 ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 415 416 /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted 417 from local to global. Then use bcastSF to fill Ca, Cj. 418 */ 419 ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */ 420 MatColIdxKokkosView rows("rows",ioffset[niranks]); 421 Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */ 422 423 rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */ 424 425 MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */ 426 abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */ 427 428 const auto& Ba = bkok->a_dual.view_device(); 429 const auto& Bi = bkok->i_dual.view_device(); 430 const auto& Bj = bkok->j_dual.view_device(); 431 432 /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */ 433 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 434 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 435 PetscInt r = rows(i); 436 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 437 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 438 abuf(base+k) = Ba(Bi(r)+k); 439 jbuf(base+k) = l2g(Bj(Bi(r)+k)); 440 }); 441 }); 442 443 /* Send abuf & jbuf to fill Ca, Cj */ 444 ierr = PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 445 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 446 ierr = PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 447 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 448 Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */ 449 Cj.sync_host(); 450 Ca.modify_device(); 451 452 /* Construct C with Ca, Ci, Cj */ 453 auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca); 454 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr); 455 ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr); 456 ierr = PetscFree(Browlens);CHKERRQ(ierr); 457 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 458 PetscFunctionReturn(0); 459 } 460 461 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C) 462 463 It is the reverse of MatSeqAIJKokkosBcast in some sense. 464 465 Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves. 466 In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might 467 contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF. 468 469 Input Parameters: 470 + A - the SEQAIJKOKKOS matrix to be reduced 471 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 472 . local - true if A uses local col ids; false if A is already in global col ids. 473 . N - if local, N is A's global col size 474 . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N). 475 - ownerSF - the SF specifies ownership (root) of rows in A 476 477 Output Parameters: 478 + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side 479 . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows. 480 . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len. 481 . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to 482 unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset. 483 - C - the matrix made up by rows sent to me from other ranks, using global col ids 484 485 TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX. 486 */ 487 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 488 PetscSF& reduceSF,MatScalarKokkosView& abuf, 489 MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset, 490 KokkosCsrMatrix& C) 491 { 492 PetscErrorCode ierr; 493 PetscInt i,r,Am,An,Annz,Cnnz,nrows; 494 const PetscInt *Ai; 495 Mat_SeqAIJKokkos *akok; 496 497 PetscFunctionBegin; 498 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */ 499 ierr = MatGetSize(A,&Am,&An); 500 Ai = static_cast<Mat_SeqAIJ*>(A->data)->i; 501 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 502 Annz = Ai[Am]; 503 504 if (reuse == MAT_REUSE_MATRIX) { 505 /* Send Aa to abuf */ 506 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 507 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 508 509 /* Copy abuf to Ca */ 510 const MatScalarKokkosView& Ca = C.values; 511 nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */ 512 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 513 PetscInt i = t.league_rank(); 514 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 515 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 516 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);}); 517 }); 518 } else if (reuse == MAT_INITIAL_MATRIX) { 519 MPI_Comm comm; 520 MPI_Request *reqs; 521 PetscMPIInt tag; 522 PetscInt Cm; 523 524 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr); 525 ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr); 526 527 PetscInt niranks,nranks,nroots,nleaves; 528 const PetscMPIInt *iranks,*ranks; 529 const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */ 530 ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr); 531 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */ 532 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */ 533 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 534 if (nleaves != Am) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am); 535 Cm = nroots; 536 nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */ 537 538 /* Tell owners how long each row I will send */ 539 PetscInt *srowlens; /* send buf of row lens */ 540 MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */ 541 PetscInt *rrowlens = rrowlens_h.data(); 542 543 ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr); 544 for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i]; 545 rrowlens[0] = 0; 546 rrowlens++; /* shift the pointer to make the following expression more readable */ 547 for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 548 for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);} 549 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 550 551 /* Owner builds Ci on host by histogramming rrowlens[] */ 552 MatRowMapKokkosViewHost Ci_h("i",Cm+1); 553 Kokkos::deep_copy(Ci_h,0); /* Zero Ci */ 554 MatRowMapType *Ci_ptr = Ci_h.data(); 555 556 for (i=0; i<nrows; i++) { 557 r = rows[i]; /* local row id of i-th received row */ 558 #if defined(PETSC_USE_DEBUG) 559 if (r<0 || r>=Cm) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm); 560 #endif 561 Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */ 562 } 563 for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */ 564 Cnnz = Ci_ptr[Cm]; 565 566 /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */ 567 MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows); 568 PetscInt *dstrowoffset_hptr = dstrowoffset_h.data(); 569 PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */ 570 571 ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */ 572 for (i=0; i<nrows; i++) { /* for each row I receive */ 573 r = rows[i]; /* row id in C */ 574 dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */ 575 currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */ 576 } 577 ierr = PetscFree(currowlens);CHKERRQ(ierr); 578 579 rrowlens--; 580 for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */ 581 dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h); 582 srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */ 583 584 /* Build the reduceSF, which performs buffer to buffer send/recv */ 585 PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */ 586 ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr); 587 for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]]; 588 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 589 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 590 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 591 592 /* Nonzeros in abuf/jbuf are roots and those in A are leaves */ 593 PetscInt nroots2 = Cnnz,nleaves2 = Annz; 594 PetscSFNode *iremote; 595 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */ 596 for (i=0; i<nranks; i++) { 597 PetscInt rootbase = rdisp[i]; /* root offset at this root rank */ 598 PetscInt leafbase = Ai[roffset[i]]; /* leaf base */ 599 PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */ 600 for (PetscInt k=0; k<nz; k++) { 601 iremote[leafbase+k].rank = ranks[i]; 602 iremote[leafbase+k].index = rootbase + k; 603 } 604 } 605 ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr); 606 ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 607 ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr); 608 609 /* Reduce Aa, Ajg to abuf and jbuf */ 610 611 /* If A uses local col ids, convert them to global ones before sending */ 612 MatColIdxKokkosView Ajg; 613 if (local) { 614 Ajg = MatColIdxKokkosView("j",Annz); 615 const MatColIdxKokkosView& Aj = akok->j_dual.view_device(); 616 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));}); 617 } else { 618 Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */ 619 } 620 621 MatColIdxKokkosView jbuf("jbuf",Cnnz); 622 abuf = MatScalarKokkosView("abuf",Cnnz); 623 ierr = PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 624 ierr = PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 625 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 626 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 627 628 /* Copy data from abuf, jbuf to Ca, Cj */ 629 MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */ 630 MatColIdxKokkosView Cj("j",Cnnz); 631 MatScalarKokkosView Ca("a",Cnnz); 632 633 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 634 PetscInt i = t.league_rank(); 635 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 636 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 637 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) { 638 Ca(dst+k) = abuf(src+k); 639 Cj(dst+k) = jbuf(src+k); 640 }); 641 }); 642 643 /* Build C with Ca, Ci, Cj */ 644 C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj); 645 ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr); 646 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 647 PetscFunctionReturn(0); 648 } 649 650 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix 651 652 Input Parameters: 653 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 654 . reuse - indicate whether the matrix has called this function before 655 . csrmat - the KokkosCsrMatrix, of size m,N 656 - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first 657 entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag 658 entry is 5, then Cdstart[i] = 3. 659 660 Output Parameters: 661 + C - the updated MATMPIAIJKOKKOS matrix 662 - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 663 664 Notes: 665 Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern 666 */ 667 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart) 668 { 669 PetscErrorCode ierr; 670 const MatScalarKokkosView& Ca = csrmat.values; 671 const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map; 672 PetscInt m,n,N; 673 674 PetscFunctionBegin; 675 ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr); 676 ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr); 677 678 if (reuse == MAT_REUSE_MATRIX) { 679 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data); 680 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr); 681 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr); 682 const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device(); 683 const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device(); 684 685 /* Fill 'a' of Cd and Co on device */ 686 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 687 PetscInt i = t.league_rank(); /* row i */ 688 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 689 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 690 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 691 PetscInt cdend = cdstart + cdlen; 692 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 693 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 694 if (k < cdstart) { /* k in [0, cdstart) */ 695 Coa(Coi(i)+k) = Ca(Ci(i)+k); 696 } else if (k < cdend) { /* k in [cdstart, cdend) */ 697 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 698 } else { /* k in [cdend, clen) */ 699 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 700 } 701 }); 702 }); 703 704 akok->a_dual.modify_device(); 705 bkok->a_dual.modify_device(); 706 } else if (reuse == MAT_INITIAL_MATRIX) { 707 Mat Cd,Co; 708 const MatColIdxKokkosView& Cj = csrmat.graph.entries; 709 MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1); 710 MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device(); 711 PetscInt cstart,cend; 712 713 /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks: 714 left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend). 715 Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend, 716 such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly 717 stores values of cdstart and cdend-cstart (aka Cdi[]) instead. 718 */ 719 Cdstart = MatRowMapKokkosView("Cdstart",m); 720 ierr = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */ 721 722 /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a 723 CUDA warp would completely diverge. So I use TeamPolicy with a team size 1. 724 */ 725 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 726 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 727 PetscInt i = t.league_rank(); /* row i */ 728 PetscInt j,first,count,step; 729 730 if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */ 731 Cdi(0) = 0; 732 Coi(0) = 0; 733 } 734 735 /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns 736 in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found. 737 */ 738 count = Ci(i+1)-Ci(i); 739 first = Ci(i); 740 while (count > 0) { 741 j = first; 742 step = count / 2; 743 j += step; 744 if (Cj(j) < cstart) { 745 first = ++j; 746 count -= step + 1; 747 } else count = step; 748 } 749 Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */ 750 751 /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */ 752 count = Ci(i+1) - first; 753 while (count > 0) { 754 j = first; 755 step = count / 2; 756 j += step; 757 if (Cj(j) < cend) { 758 first = ++j; 759 count -= step + 1; 760 } else count = step; 761 } 762 Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */ 763 Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */ 764 }); 765 }); 766 767 /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */ 768 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 769 update += Cdi(i); 770 if (final) Cdi(i) = update; 771 }); 772 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 773 update += Coi(i); 774 if (final) Coi(i) = update; 775 }); 776 777 /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in 778 MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co. 779 */ 780 Cdi_dual.modify_device(); 781 Coi_dual.modify_device(); 782 Cdi_dual.sync_host(); 783 Coi_dual.sync_host(); 784 PetscInt Cd_nnz = Cdi_dual.view_host().data()[m]; 785 PetscInt Co_nnz = Coi_dual.view_host().data()[m]; 786 787 /* With nnz, allocate a, j for Cd and Co */ 788 MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz); 789 MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz); 790 791 /* Fill a, j of Cd and Co on device */ 792 MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device(); 793 MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device(); 794 795 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 796 PetscInt i = t.league_rank(); /* row i */ 797 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 798 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 799 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 800 PetscInt cdend = cdstart + cdlen; 801 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 802 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 803 if (k < cdstart) { /* k in [0, cdstart) */ 804 Coa(Coi(i)+k) = Ca(Ci(i)+k); 805 Coj(Coi(i)+k) = Cj(Ci(i)+k); 806 } else if (k < cdend) { /* k in [cdstart, cdend) */ 807 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 808 Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */ 809 } else { /* k in [cdend, clen) */ 810 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 811 Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k); 812 } 813 }); 814 }); 815 816 Cdj_dual.modify_device(); 817 Cda_dual.modify_device(); 818 Coj_dual.modify_device(); 819 Coa_dual.modify_device(); 820 /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */ 821 auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual); 822 auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual); 823 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr); 824 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr); 825 ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */ 826 } 827 PetscFunctionReturn(0); 828 } 829 830 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids. 831 832 It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view. 833 834 Input Parameters: 835 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 836 . reuse - indicate whether the matrix has called this function before 837 . csrmat - the KokkosCsrMatrix, of size m,N 838 - Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first 839 entry of the diag block of C in csrmat's j array. 840 841 Output Parameters: 842 + C - the updated MATMPIAIJKOKKOS matrix 843 - Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 844 845 Notes: the input matrix's col ids and col size will be changed. 846 */ 847 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g) 848 { 849 PetscErrorCode ierr; 850 Mat_SeqAIJKokkos *ckok; 851 ISLocalToGlobalMapping l2gmap; 852 const PetscInt *garray; 853 PetscInt sz; 854 855 PetscFunctionBegin; 856 /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */ 857 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr); 858 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 859 ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */ 860 ckok->j_dual.sync_device(); 861 ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */ 862 863 /* Build l2g -- the local to global mapping of C's cols */ 864 ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr); 865 ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr); 866 if (C->cmap->n != sz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz); 867 868 ConstMatColIdxKokkosViewHost tmp(garray,sz); 869 l2g = MatColIdxKokkosView("l2g",sz); 870 Kokkos::deep_copy(l2g,tmp); 871 872 ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr); 873 ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr); 874 PetscFunctionReturn(0); 875 } 876 877 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos 878 879 Input Parameters: 880 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 881 . A - an MPIAIJKOKKOS matrix 882 . B - an MPIAIJKOKKOS matrix 883 - mm - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations. 884 885 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 886 */ 887 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm) 888 { 889 PetscErrorCode ierr; 890 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 891 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 892 IS glob = NULL; 893 const PetscInt *garray; 894 PetscInt N = B->cmap->N,sz; 895 ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */ 896 MatColIdxKokkosView l2g2; 897 Mat C1,C2; /* intermediate matrices */ 898 899 PetscFunctionBegin; 900 /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */ 901 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr); 902 ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr); 903 ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr); 904 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 905 C1->product->api_user = product->api_user; 906 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 907 if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 908 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 909 910 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 911 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 912 const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */ 913 l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */ 914 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global); 915 916 /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */ 917 ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf, 918 mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr); 919 920 /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */ 921 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr); 922 ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr); 923 ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr); 924 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 925 C2->product->api_user = product->api_user; 926 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 927 if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 928 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 929 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global); 930 931 /* C = C1 + C2. We actually use their global col ids versions in adding */ 932 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */ 933 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 934 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 935 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 936 937 mm->C1 = C1; 938 mm->C2 = C2; 939 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 940 ierr = ISDestroy(&glob);CHKERRQ(ierr); 941 PetscFunctionReturn(0); 942 } 943 944 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos 945 946 Input Parameters: 947 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 948 . A - an MPIAIJKOKKOS matrix 949 . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank. 950 . localB - Does B use local col ids? If false, then B is already in global col ids. 951 . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator. 952 . l2g - If localB, then l2g maps B's local col ids to global ones. 953 - mm - a struct used to stash intermediate data in AtB 954 955 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 956 */ 957 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm) 958 { 959 PetscErrorCode ierr; 960 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 961 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 962 Mat C1,C2; /* intermediate matrices */ 963 964 PetscFunctionBegin; 965 /* C1 = Ad^t * B */ 966 ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr); 967 ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr); 968 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 969 C1->product->api_user = product->api_user; 970 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 971 if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 972 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 973 974 if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);} 975 else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */ 976 977 /* C2 = Ao^t * B */ 978 ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr); 979 ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr); 980 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 981 C2->product->api_user = product->api_user; 982 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 983 if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 984 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 985 986 ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf, 987 mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr); 988 989 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */ 990 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 991 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 992 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 993 mm->C1 = C1; 994 mm->C2 = C2; 995 PetscFunctionReturn(0); 996 } 997 998 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C) 999 { 1000 PetscErrorCode ierr; 1001 Mat_Product *product = C->product; 1002 MatProductType ptype; 1003 MatProductData_MPIAIJKokkos *mmdata; 1004 MatMatStruct *mm = NULL; 1005 MatMatStruct_AB *ab; 1006 MatMatStruct_AtB *atb; 1007 Mat A,B,Ad,Ao,Bd,Bo; 1008 const MatScalarType one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */ 1009 1010 PetscFunctionBegin; 1011 MatCheckProduct(C,1); 1012 mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data); 1013 ptype = product->type; 1014 A = product->A; 1015 B = product->B; 1016 Ad = static_cast<Mat_MPIAIJ*>(A->data)->A; 1017 Ao = static_cast<Mat_MPIAIJ*>(A->data)->B; 1018 Bd = static_cast<Mat_MPIAIJ*>(B->data)->A; 1019 Bo = static_cast<Mat_MPIAIJ*>(B->data)->B; 1020 1021 if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 1022 mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 1023 ab = mmdata->mmAB; 1024 atb = mmdata->mmAtB; 1025 if (ab) { 1026 static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE; 1027 static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE; 1028 } 1029 if (atb) { 1030 static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE; 1031 static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE; 1032 } 1033 PetscFunctionReturn(0); 1034 } 1035 1036 if (ptype == MATPRODUCT_AB) { 1037 ab = mmdata->mmAB; 1038 /* C1 = Ad * B_local */ 1039 if (!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB"); 1040 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1041 if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed"); 1042 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1043 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1044 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1045 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1046 /* C2 = Ao * B_other */ 1047 if (ab->C2->product->B != ab->B_other) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed"); 1048 if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1049 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); 1050 /* C = C1_global + C2_global */ 1051 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1052 mm = static_cast<MatMatStruct*>(ab); 1053 } else if (ptype == MATPRODUCT_AtB) { 1054 atb = mmdata->mmAtB; 1055 if (!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB"); 1056 /* C1 = Ad^t * B_local */ 1057 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr); 1058 if (atb->C1->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed"); 1059 if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1060 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1061 1062 /* C2 = Ao^t * B_local */ 1063 if (atb->C2->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed"); 1064 if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1065 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1066 /* Form C2_global */ 1067 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1068 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1069 /* C = C1_global + C2_global */ 1070 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1071 mm = static_cast<MatMatStruct*>(atb); 1072 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1073 ab = mmdata->mmAB; 1074 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1075 1076 /* ab->C1 = Ad * B_local */ 1077 if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed"); 1078 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1079 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1080 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1081 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1082 /* ab->C2 = Ao * B_other */ 1083 if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1084 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */ 1085 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1086 1087 /* atb->C1 = Bd^t * ab->C_petsc */ 1088 atb = mmdata->mmAtB; 1089 if (atb->C1->product->B != ab->C_petsc) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed"); 1090 if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1091 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1092 /* atb->C2 = Bo^t * ab->C_petsc */ 1093 if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1094 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1095 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1096 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1097 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1098 mm = static_cast<MatMatStruct*>(atb); 1099 } 1100 /* Split C_global to form C */ 1101 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1102 PetscFunctionReturn(0); 1103 } 1104 1105 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C) 1106 { 1107 PetscErrorCode ierr; 1108 Mat A,B; 1109 Mat_Product *product = C->product; 1110 MatProductType ptype; 1111 MatProductData_MPIAIJKokkos *mmdata; 1112 MatMatStruct *mm = NULL; 1113 IS glob = NULL; 1114 const PetscInt *garray; 1115 PetscInt m,n,M,N,sz; 1116 ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */ 1117 1118 PetscFunctionBegin; 1119 MatCheckProduct(C,1); 1120 if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1121 ptype = product->type; 1122 A = product->A; 1123 B = product->B; 1124 1125 switch (ptype) { 1126 case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break; 1127 case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break; 1128 case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */ 1129 default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 1130 } 1131 1132 ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr); 1133 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 1134 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 1135 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1136 1137 mmdata = new MatProductData_MPIAIJKokkos(); 1138 mmdata->reusesym = product->api_user; 1139 1140 if (ptype == MATPRODUCT_AB) { 1141 mmdata->mmAB = new MatMatStruct_AB(); 1142 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr); 1143 mm = static_cast<MatMatStruct*>(mmdata->mmAB); 1144 } else if (ptype == MATPRODUCT_AtB) { 1145 mmdata->mmAtB = new MatMatStruct_AtB(); 1146 auto atb = mmdata->mmAtB; 1147 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr); 1148 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 1149 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 1150 l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz)); 1151 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr); 1152 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 1153 ierr = ISDestroy(&glob);CHKERRQ(ierr); 1154 mm = static_cast<MatMatStruct*>(atb); 1155 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1156 mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */ 1157 mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */ 1158 auto ab = mmdata->mmAB; 1159 auto atb = mmdata->mmAtB; 1160 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr); 1161 auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */ 1162 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr); 1163 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr); 1164 mm = static_cast<MatMatStruct*>(atb); 1165 } 1166 /* Split the C_global into petsc A, B format */ 1167 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1168 C->product->data = mmdata; 1169 C->product->destroy = MatProductDataDestroy_MPIAIJKokkos; 1170 C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos; 1171 PetscFunctionReturn(0); 1172 } 1173 1174 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat) 1175 { 1176 PetscErrorCode ierr; 1177 Mat_Product *product = mat->product; 1178 PetscBool match = PETSC_FALSE; 1179 PetscBool usecpu = PETSC_FALSE; 1180 1181 PetscFunctionBegin; 1182 MatCheckProduct(mat,1); 1183 if (!product->A->boundtocpu && !product->B->boundtocpu) { 1184 ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr); 1185 } 1186 if (match) { /* we can always fallback to the CPU if requested */ 1187 switch (product->type) { 1188 case MATPRODUCT_AB: 1189 if (product->api_user) { 1190 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 1191 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1192 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1193 } else { 1194 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 1195 ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1196 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1197 } 1198 break; 1199 case MATPRODUCT_AtB: 1200 if (product->api_user) { 1201 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 1202 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1203 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1204 } else { 1205 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 1206 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1207 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1208 } 1209 break; 1210 case MATPRODUCT_PtAP: 1211 if (product->api_user) { 1212 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1213 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1214 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1215 } else { 1216 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 1217 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1218 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1219 } 1220 break; 1221 default: 1222 break; 1223 } 1224 match = (PetscBool)!usecpu; 1225 } 1226 if (match) { 1227 switch (product->type) { 1228 case MATPRODUCT_AB: 1229 case MATPRODUCT_AtB: 1230 case MATPRODUCT_PtAP: 1231 mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos; 1232 break; 1233 default: 1234 break; 1235 } 1236 } 1237 /* fallback to MPIAIJ ops */ 1238 if (!mat->ops->productsymbolic) { 1239 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 1240 } 1241 PetscFunctionReturn(0); 1242 } 1243 1244 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A) 1245 { 1246 PetscErrorCode ierr; 1247 1248 PetscFunctionBegin; 1249 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 1250 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 1251 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1256 { 1257 PetscErrorCode ierr; 1258 Mat B; 1259 Mat_MPIAIJ *a; 1260 1261 PetscFunctionBegin; 1262 if (reuse == MAT_INITIAL_MATRIX) { 1263 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 1264 } else if (reuse == MAT_REUSE_MATRIX) { 1265 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 1266 } 1267 B = *newmat; 1268 1269 B->boundtocpu = PETSC_FALSE; 1270 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1271 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 1272 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1273 1274 a = static_cast<Mat_MPIAIJ*>(A->data); 1275 if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1276 if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1277 if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);} 1278 1279 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 1280 B->ops->mult = MatMult_MPIAIJKokkos; 1281 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 1282 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1283 B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos; 1284 B->ops->destroy = MatDestroy_MPIAIJKokkos; 1285 1286 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 1287 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr); 1288 PetscFunctionReturn(0); 1289 } 1290 1291 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 1292 { 1293 PetscErrorCode ierr; 1294 1295 PetscFunctionBegin; 1296 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1297 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 1298 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 1299 PetscFunctionReturn(0); 1300 } 1301 1302 /*@C 1303 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1304 (the default parallel PETSc format). This matrix will ultimately pushed down 1305 to Kokkos for calculations. For good matrix 1306 assembly performance the user should preallocate the matrix storage by setting 1307 the parameter nz (or the array nnz). By setting these parameters accurately, 1308 performance during matrix assembly can be increased by more than a factor of 50. 1309 1310 Collective 1311 1312 Input Parameters: 1313 + comm - MPI communicator, set to PETSC_COMM_SELF 1314 . m - number of rows 1315 . n - number of columns 1316 . nz - number of nonzeros per row (same for all rows) 1317 - nnz - array containing the number of nonzeros in the various rows 1318 (possibly different for each row) or NULL 1319 1320 Output Parameter: 1321 . A - the matrix 1322 1323 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1324 MatXXXXSetPreallocation() paradigm instead of this routine directly. 1325 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1326 1327 Notes: 1328 If nnz is given then nz is ignored 1329 1330 The AIJ format (also called the Yale sparse matrix format or 1331 compressed row storage), is fully compatible with standard Fortran 77 1332 storage. That is, the stored row and column indices can begin at 1333 either one (as in Fortran) or zero. See the users' manual for details. 1334 1335 Specify the preallocated storage with either nz or nnz (not both). 1336 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1337 allocation. For large problems you MUST preallocate memory or you 1338 will get TERRIBLE performance, see the users' manual chapter on matrices. 1339 1340 By default, this format uses inodes (identical nodes) when possible, to 1341 improve numerical efficiency of matrix-vector products and solves. We 1342 search for consecutive rows with the same nonzero structure, thereby 1343 reusing matrix information to achieve increased efficiency. 1344 1345 Level: intermediate 1346 1347 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 1348 @*/ 1349 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 1350 { 1351 PetscErrorCode ierr; 1352 PetscMPIInt size; 1353 1354 PetscFunctionBegin; 1355 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1356 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 1357 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1358 if (size > 1) { 1359 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1360 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 1361 } else { 1362 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1363 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 1364 } 1365 PetscFunctionReturn(0); 1366 } 1367 1368 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 1369 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 1370 { 1371 PetscMPIInt size,rank; 1372 MPI_Comm comm; 1373 PetscErrorCode ierr; 1374 PetscSplitCSRDataStructure d_mat=NULL; 1375 1376 PetscFunctionBegin; 1377 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1378 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1379 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 1380 if (size == 1) { 1381 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 1382 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */ 1383 } else { 1384 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1385 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 1386 ierr = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr); 1387 ierr = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr); 1388 } 1389 // act like MatSetValues because not called on host 1390 if (A->assembled) { 1391 if (A->was_assembled) { 1392 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 1393 } 1394 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 1395 } else { 1396 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr); 1397 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 1398 } 1399 if (!d_mat) { 1400 struct _n_SplitCSRMat h_mat; /* host container */ 1401 Mat_SeqAIJKokkos *aijkokA; 1402 Mat_SeqAIJ *jaca; 1403 PetscInt n = A->rmap->n, nnz; 1404 Mat Amat; 1405 PetscInt *colmap; 1406 1407 /* create and copy h_mat */ 1408 h_mat.M = A->cmap->N; // use for debug build 1409 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 1410 if (size == 1) { 1411 Amat = A; 1412 jaca = (Mat_SeqAIJ*)A->data; 1413 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 1414 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 1415 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 1416 h_mat.offdiag.a = NULL; 1417 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1418 aijkokA->i_uncompressed_d = NULL; 1419 aijkokA->colmap_d = NULL; 1420 } else { 1421 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1422 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 1423 PetscInt ii; 1424 Mat_SeqAIJKokkos *aijkokB; 1425 1426 Amat = aij->A; 1427 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 1428 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 1429 aijkokA->i_uncompressed_d = NULL; 1430 aijkokA->colmap_d = NULL; 1431 jaca = (Mat_SeqAIJ*)aij->A->data; 1432 if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 1433 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 1434 aij->donotstash = PETSC_TRUE; 1435 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 1436 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly 1437 ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr); 1438 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr); 1439 for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1; 1440 // allocate B copy data 1441 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 1442 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 1443 nnz = jacb->i[n]; 1444 if (jacb->compressedrow.use) { 1445 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 1446 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 1447 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 1448 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 1449 } else { 1450 h_mat.offdiag.i = aijkokB->i_device_data(); 1451 } 1452 h_mat.offdiag.j = aijkokB->j_device_data(); 1453 h_mat.offdiag.a = aijkokB->a_device_data(); 1454 { 1455 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N); 1456 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 1457 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 1458 h_mat.colmap = aijkokB->colmap_d->data(); 1459 ierr = PetscFree(colmap);CHKERRQ(ierr); 1460 } 1461 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 1462 h_mat.offdiag.n = n; 1463 } 1464 // allocate A copy data 1465 nnz = jaca->i[n]; 1466 h_mat.diag.n = n; 1467 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 1468 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 1469 if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 1470 else { 1471 h_mat.diag.i = aijkokA->i_device_data(); 1472 } 1473 h_mat.diag.j = aijkokA->j_device_data(); 1474 h_mat.diag.a = aijkokA->a_device_data(); 1475 // copy pointers and metdata to device 1476 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 1477 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 1478 ierr = PetscInfo2(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 1479 } 1480 *B = d_mat; // return it, set it in Mat, and set it up 1481 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 1482 PetscFunctionReturn(0); 1483 } 1484 1485 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask) 1486 { 1487 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1488 1489 PetscFunctionBegin; 1490 if (!aijkok) *mask = "AIJKOK_UNALLOCATED"; 1491 else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU"; 1492 else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU"; 1493 else *mask = "PETSC_OFFLOAD_BOTH"; 1494 PetscFunctionReturn(0); 1495 } 1496 1497 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A) 1498 { 1499 PetscErrorCode ierr; 1500 PetscMPIInt size; 1501 Mat Ad,Ao; 1502 const char *amask,*bmask; 1503 1504 PetscFunctionBegin; 1505 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 1506 1507 if (size == 1) { 1508 ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr); 1509 ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr); 1510 } else { 1511 Ad = ((Mat_MPIAIJ*)A->data)->A; 1512 Ao = ((Mat_MPIAIJ*)A->data)->B; 1513 ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr); 1514 ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr); 1515 ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr); 1516 } 1517 PetscFunctionReturn(0); 1518 } 1519