1 #include <petscvec_kokkos.hpp> 2 #include <petscsf.h> 3 #include <petsc/private/sfimpl.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> 5 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 6 #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp> 7 #include <KokkosSparse_spadd.hpp> 8 9 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 10 { 11 PetscErrorCode ierr; 12 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 13 Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 14 15 PetscFunctionBegin; 16 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 17 if (aijkok && aijkok->device_mat_d.data()) { 18 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 19 } 20 21 PetscFunctionReturn(0); 22 } 23 24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 25 { 26 PetscErrorCode ierr; 27 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 28 29 PetscFunctionBegin; 30 ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 31 ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 32 #if defined(PETSC_USE_DEBUG) 33 if (d_nnz) { 34 PetscInt i; 35 for (i=0; i<mat->rmap->n; i++) { 36 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 37 } 38 } 39 if (o_nnz) { 40 PetscInt i; 41 for (i=0; i<mat->rmap->n; i++) { 42 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 43 } 44 } 45 #endif 46 #if defined(PETSC_USE_CTABLE) 47 ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr); 48 #else 49 ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr); 50 #endif 51 ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr); 52 ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr); 53 ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr); 54 /* Because the B will have been resized we simply destroy it and create a new one each time */ 55 ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr); 56 57 if (!mpiaij->A) { 58 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 59 ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 60 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 61 } 62 if (!mpiaij->B) { 63 PetscMPIInt size; 64 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr); 65 ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 66 ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr); 67 ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 68 } 69 ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 70 ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 71 ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 72 ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 73 mat->preallocated = PETSC_TRUE; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 78 { 79 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 80 PetscErrorCode ierr; 81 PetscInt nt; 82 83 PetscFunctionBegin; 84 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 85 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 86 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 88 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 89 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 94 { 95 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 102 ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 104 ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 110 { 111 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 112 PetscErrorCode ierr; 113 PetscInt nt; 114 115 PetscFunctionBegin; 116 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 117 if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt); 118 ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 119 ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 120 ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 121 ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 122 PetscFunctionReturn(0); 123 } 124 125 /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS. 126 A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n). 127 C still uses local column ids. Their corresponding global column ids are returned in glob. 128 */ 129 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C) 130 { 131 Mat Ad,Ao; 132 const PetscInt *cmap; 133 PetscErrorCode ierr; 134 135 PetscFunctionBegin; 136 ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr); 137 ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr); 138 if (glob) { 139 PetscInt cst, i, dn, on, *gidx; 140 ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 141 ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 142 ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr); 143 ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 144 for (i=0; i<dn; i++) gidx[i] = cst + i; 145 for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 146 ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 147 } 148 PetscFunctionReturn(0); 149 } 150 151 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */ 152 struct MatMatStruct { 153 MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */ 154 PetscSF sf; /* SF to send/recv matrix entries */ 155 MatScalarKokkosView abuf; /* buf of mat values in send/recv */ 156 Mat C1,C2,B_local; 157 KokkosCsrMatrix C1_global,C2_global,C_global; 158 KernelHandle kh; 159 MatMatStruct() { 160 C1 = C2 = B_local = NULL; 161 sf = NULL; 162 } 163 164 ~MatMatStruct() { 165 MatDestroy(&C1); 166 MatDestroy(&C2); 167 MatDestroy(&B_local); 168 PetscSFDestroy(&sf); 169 kh.destroy_spadd_handle(); 170 } 171 }; 172 173 struct MatMatStruct_AB : public MatMatStruct { 174 MatColIdxKokkosView rows; 175 MatRowMapKokkosView rowoffset; 176 Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */ 177 178 MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){} 179 ~MatMatStruct_AB() { 180 MatDestroy(&B_other); 181 MatDestroy(&C_petsc); 182 } 183 }; 184 185 struct MatMatStruct_AtB : public MatMatStruct { 186 MatRowMapKokkosView srcrowoffset,dstrowoffset; 187 }; 188 189 struct MatProductData_MPIAIJKokkos 190 { 191 MatMatStruct_AB *mmAB; 192 MatMatStruct_AtB *mmAtB; 193 PetscBool reusesym; 194 195 MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){} 196 ~MatProductData_MPIAIJKokkos() { 197 delete mmAB; 198 delete mmAtB; 199 } 200 }; 201 202 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data) 203 { 204 PetscFunctionBegin; 205 CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data)); 206 PetscFunctionReturn(0); 207 } 208 209 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix 210 211 Input Parameters: 212 + A - the MATSEQAIJKOKKOS matrix 213 . N - new column size for the returned Kokkos matrix 214 - l2g - a map that maps old col ids to new col ids 215 216 Output Parameters: 217 . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A. 218 */ 219 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat) 220 { 221 KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat; 222 MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */ 223 224 PetscFunctionBegin; 225 CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));})); 226 CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg)); 227 PetscFunctionReturn(0); 228 } 229 230 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix. 231 It is similar to MatCreateMPIAIJWithSplitArrays. 232 233 Input Parameters: 234 + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set 235 . A - the diag matrix using local col ids 236 - B - the offdiag matrix using global col ids 237 238 Output Parameters: 239 . mat - the updated MATMPIAIJKOKKOS matrix 240 */ 241 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B) 242 { 243 PetscErrorCode ierr; 244 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 245 PetscInt m,n,M,N,Am,An,Bm,Bn; 246 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 247 248 PetscFunctionBegin; 249 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 250 ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr); 251 ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr); 252 ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr); 253 254 if (m != Am || m != Bm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match"); 255 if (n != An) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match"); 256 if (N != Bn) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match"); 257 if (mpiaij->A || mpiaij->B) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty"); 258 mpiaij->A = A; 259 mpiaij->B = B; 260 261 mat->preallocated = PETSC_TRUE; 262 mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */ 263 264 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr); 265 ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 266 /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and 267 also gets mpiaij->B compacted, with its col ids and size reduced 268 */ 269 ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 270 ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr); 271 ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 272 273 /* Update bkok with new local col ids (stored on host) and size */ 274 bkok->j_dual.modify_host(); 275 bkok->j_dual.sync_device(); 276 bkok->SetColSize(mpiaij->B->cmap->n); 277 PetscFunctionReturn(0); 278 } 279 280 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C). 281 282 It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF. 283 In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves. 284 Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k 285 to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph). 286 287 Collective on comm of ownerSF 288 289 Input Parameters: 290 + B - the SEQAIJKOKKOS matrix, using local col ids 291 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 292 . N - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX) 293 . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX) 294 . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX) 295 296 Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX) 297 + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros. 298 . abuf - buffer for sending matrix values 299 . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[]. 300 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors. 301 . rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[] 302 - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids. 303 */ 304 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 305 PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows, 306 MatRowMapKokkosView& rowoffset,Mat& C) 307 { 308 PetscErrorCode ierr; 309 Mat_SeqAIJKokkos *bkok,*ckok; 310 311 PetscFunctionBegin; 312 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */ 313 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 314 315 if (reuse == MAT_REUSE_MATRIX) { 316 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 317 318 const auto& Ba = bkok->a_dual.view_device(); 319 const auto& Bi = bkok->i_dual.view_device(); 320 const auto& Ca = ckok->a_dual.view_device(); 321 322 /* Copy Ba to abuf */ 323 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 324 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 325 PetscInt r = rows(i); 326 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 327 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 328 abuf(base+k) = Ba(Bi(r)+k); 329 }); 330 }); 331 332 /* Send abuf to Ca through bcastSF and then mark C is updated on device */ 333 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */ 334 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); 335 ckok->a_dual.modify_device(); 336 } else if (reuse == MAT_INITIAL_MATRIX) { 337 MPI_Comm comm; 338 PetscMPIInt tag; 339 PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves; 340 341 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr); 342 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 343 Cm = nleaves; /* row size of C */ 344 Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */ 345 346 /* Get row lens (nz) of B's rows for later fast query */ 347 PetscInt *Browlens; 348 const PetscInt *tmp = bkok->i_host_data(); 349 ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr); 350 for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k]; 351 352 /* By ownerSF, each proc gets lens of rows of C */ 353 MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */ 354 Ci_h = Ci.view_host().data(); 355 Ci_h[0] = 0; 356 ierr = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 357 ierr = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr); 358 for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */ 359 Cnnz = Ci_h[Cm]; 360 Ci.modify_host(); 361 Ci.sync_device(); 362 363 /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */ 364 MatColIdxKokkosDualView Cj("j",Cnnz); 365 MatScalarKokkosDualView Ca("a",Cnnz); 366 367 /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */ 368 const PetscMPIInt *iranks,*ranks; 369 const PetscInt *ioffset,*irootloc,*roffset; 370 PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr; 371 MPI_Request *reqs; 372 373 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */ 374 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */ 375 376 /* figure out offsets at the send buffer, to build the SF 377 sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver. 378 rowptr[] - stores offsets for data of each row in abuf 379 380 rdisp[] - to receive sdisp[] 381 */ 382 ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr); 383 MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */ 384 rowptr = rowptr_h.data(); 385 386 sdisp[0] = 0; 387 rowptr[0] = 0; 388 for (i=0; i<niranks; i++) { /* for each receiver */ 389 PetscInt len, nz = 0; 390 for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */ 391 len = Browlens[irootloc[j]]; 392 rowptr[j+1] = rowptr[j] + len; 393 nz += len; 394 } 395 sdisp[i+1] = sdisp[i] + nz; 396 } 397 ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr); 398 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 399 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 400 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 401 402 PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */ 403 PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */ 404 PetscSFNode *iremote; 405 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); 406 for (i=0; i<nranks; i++) { /* for each sender */ 407 k = 0; 408 for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) { 409 iremote[j].rank = ranks[i]; 410 iremote[j].index = rdisp[i] + k; 411 k++; 412 } 413 } 414 /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */ 415 ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr); 416 ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 417 418 /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted 419 from local to global. Then use bcastSF to fill Ca, Cj. 420 */ 421 ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */ 422 MatColIdxKokkosView rows("rows",ioffset[niranks]); 423 Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */ 424 425 rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */ 426 427 MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */ 428 abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */ 429 430 const auto& Ba = bkok->a_dual.view_device(); 431 const auto& Bi = bkok->i_dual.view_device(); 432 const auto& Bj = bkok->j_dual.view_device(); 433 434 /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */ 435 Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 436 PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 437 PetscInt r = rows(i); 438 PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 439 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 440 abuf(base+k) = Ba(Bi(r)+k); 441 jbuf(base+k) = l2g(Bj(Bi(r)+k)); 442 }); 443 }); 444 445 /* Send abuf & jbuf to fill Ca, Cj */ 446 ierr = PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 447 ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 448 ierr = PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 449 ierr = PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr); 450 Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */ 451 Cj.sync_host(); 452 Ca.modify_device(); 453 454 /* Construct C with Ca, Ci, Cj */ 455 auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca); 456 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr); 457 ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr); 458 ierr = PetscFree(Browlens);CHKERRQ(ierr); 459 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 460 PetscFunctionReturn(0); 461 } 462 463 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C) 464 465 It is the reverse of MatSeqAIJKokkosBcast in some sense. 466 467 Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves. 468 In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might 469 contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF. 470 471 Input Parameters: 472 + A - the SEQAIJKOKKOS matrix to be reduced 473 . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 474 . local - true if A uses local col ids; false if A is already in global col ids. 475 . N - if local, N is A's global col size 476 . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N). 477 - ownerSF - the SF specifies ownership (root) of rows in A 478 479 Output Parameters: 480 + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side 481 . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows. 482 . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len. 483 . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to 484 unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset. 485 - C - the matrix made up by rows sent to me from other ranks, using global col ids 486 487 TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX. 488 */ 489 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 490 PetscSF& reduceSF,MatScalarKokkosView& abuf, 491 MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset, 492 KokkosCsrMatrix& C) 493 { 494 PetscErrorCode ierr; 495 PetscInt i,r,Am,An,Annz,Cnnz,nrows; 496 const PetscInt *Ai; 497 Mat_SeqAIJKokkos *akok; 498 499 PetscFunctionBegin; 500 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */ 501 ierr = MatGetSize(A,&Am,&An); 502 Ai = static_cast<Mat_SeqAIJ*>(A->data)->i; 503 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 504 Annz = Ai[Am]; 505 506 if (reuse == MAT_REUSE_MATRIX) { 507 /* Send Aa to abuf */ 508 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 509 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 510 511 /* Copy abuf to Ca */ 512 const MatScalarKokkosView& Ca = C.values; 513 nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */ 514 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 515 PetscInt i = t.league_rank(); 516 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 517 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 518 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);}); 519 }); 520 } else if (reuse == MAT_INITIAL_MATRIX) { 521 MPI_Comm comm; 522 MPI_Request *reqs; 523 PetscMPIInt tag; 524 PetscInt Cm; 525 526 ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr); 527 ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr); 528 529 PetscInt niranks,nranks,nroots,nleaves; 530 const PetscMPIInt *iranks,*ranks; 531 const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */ 532 ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr); 533 ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */ 534 ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */ 535 ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr); 536 if (nleaves != Am) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am); 537 Cm = nroots; 538 nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */ 539 540 /* Tell owners how long each row I will send */ 541 PetscInt *srowlens; /* send buf of row lens */ 542 MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */ 543 PetscInt *rrowlens = rrowlens_h.data(); 544 545 ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr); 546 for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i]; 547 rrowlens[0] = 0; 548 rrowlens++; /* shift the pointer to make the following expression more readable */ 549 for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 550 for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);} 551 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 552 553 /* Owner builds Ci on host by histogramming rrowlens[] */ 554 MatRowMapKokkosViewHost Ci_h("i",Cm+1); 555 Kokkos::deep_copy(Ci_h,0); /* Zero Ci */ 556 MatRowMapType *Ci_ptr = Ci_h.data(); 557 558 for (i=0; i<nrows; i++) { 559 r = rows[i]; /* local row id of i-th received row */ 560 #if defined(PETSC_USE_DEBUG) 561 if (r<0 || r>=Cm) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm); 562 #endif 563 Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */ 564 } 565 for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */ 566 Cnnz = Ci_ptr[Cm]; 567 568 /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */ 569 MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows); 570 PetscInt *dstrowoffset_hptr = dstrowoffset_h.data(); 571 PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */ 572 573 ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */ 574 for (i=0; i<nrows; i++) { /* for each row I receive */ 575 r = rows[i]; /* row id in C */ 576 dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */ 577 currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */ 578 } 579 ierr = PetscFree(currowlens);CHKERRQ(ierr); 580 581 rrowlens--; 582 for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */ 583 dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h); 584 srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */ 585 586 /* Build the reduceSF, which performs buffer to buffer send/recv */ 587 PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */ 588 ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr); 589 for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]]; 590 for (i=0; i<nranks; i++) {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);} 591 for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);} 592 ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 593 594 /* Nonzeros in abuf/jbuf are roots and those in A are leaves */ 595 PetscInt nroots2 = Cnnz,nleaves2 = Annz; 596 PetscSFNode *iremote; 597 ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */ 598 for (i=0; i<nranks; i++) { 599 PetscInt rootbase = rdisp[i]; /* root offset at this root rank */ 600 PetscInt leafbase = Ai[roffset[i]]; /* leaf base */ 601 PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */ 602 for (PetscInt k=0; k<nz; k++) { 603 iremote[leafbase+k].rank = ranks[i]; 604 iremote[leafbase+k].index = rootbase + k; 605 } 606 } 607 ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr); 608 ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 609 ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr); 610 611 /* Reduce Aa, Ajg to abuf and jbuf */ 612 613 /* If A uses local col ids, convert them to global ones before sending */ 614 MatColIdxKokkosView Ajg; 615 if (local) { 616 Ajg = MatColIdxKokkosView("j",Annz); 617 const MatColIdxKokkosView& Aj = akok->j_dual.view_device(); 618 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));}); 619 } else { 620 Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */ 621 } 622 623 MatColIdxKokkosView jbuf("jbuf",Cnnz); 624 abuf = MatScalarKokkosView("abuf",Cnnz); 625 ierr = PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 626 ierr = PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 627 ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 628 ierr = PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr); 629 630 /* Copy data from abuf, jbuf to Ca, Cj */ 631 MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */ 632 MatColIdxKokkosView Cj("j",Cnnz); 633 MatScalarKokkosView Ca("a",Cnnz); 634 635 Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 636 PetscInt i = t.league_rank(); 637 PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 638 PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 639 Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) { 640 Ca(dst+k) = abuf(src+k); 641 Cj(dst+k) = jbuf(src+k); 642 }); 643 }); 644 645 /* Build C with Ca, Ci, Cj */ 646 C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj); 647 ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr); 648 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 649 PetscFunctionReturn(0); 650 } 651 652 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix 653 654 Input Parameters: 655 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 656 . reuse - indicate whether the matrix has called this function before 657 . csrmat - the KokkosCsrMatrix, of size m,N 658 - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first 659 entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag 660 entry is 5, then Cdstart[i] = 3. 661 662 Output Parameters: 663 + C - the updated MATMPIAIJKOKKOS matrix 664 - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 665 666 Notes: 667 Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern 668 */ 669 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart) 670 { 671 PetscErrorCode ierr; 672 const MatScalarKokkosView& Ca = csrmat.values; 673 const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map; 674 PetscInt m,n,N; 675 676 PetscFunctionBegin; 677 ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr); 678 ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr); 679 680 if (reuse == MAT_REUSE_MATRIX) { 681 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data); 682 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr); 683 Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr); 684 const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device(); 685 const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device(); 686 687 /* Fill 'a' of Cd and Co on device */ 688 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 689 PetscInt i = t.league_rank(); /* row i */ 690 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 691 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 692 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 693 PetscInt cdend = cdstart + cdlen; 694 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 695 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 696 if (k < cdstart) { /* k in [0, cdstart) */ 697 Coa(Coi(i)+k) = Ca(Ci(i)+k); 698 } else if (k < cdend) { /* k in [cdstart, cdend) */ 699 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 700 } else { /* k in [cdend, clen) */ 701 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 702 } 703 }); 704 }); 705 706 akok->a_dual.modify_device(); 707 bkok->a_dual.modify_device(); 708 } else if (reuse == MAT_INITIAL_MATRIX) { 709 Mat Cd,Co; 710 const MatColIdxKokkosView& Cj = csrmat.graph.entries; 711 MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1); 712 MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device(); 713 PetscInt cstart,cend; 714 715 /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks: 716 left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend). 717 Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend, 718 such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly 719 stores values of cdstart and cdend-cstart (aka Cdi[]) instead. 720 */ 721 Cdstart = MatRowMapKokkosView("Cdstart",m); 722 ierr = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */ 723 724 /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a 725 CUDA warp would completely diverge. So I use TeamPolicy with a team size 1. 726 */ 727 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 728 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 729 PetscInt i = t.league_rank(); /* row i */ 730 PetscInt j,first,count,step; 731 732 if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */ 733 Cdi(0) = 0; 734 Coi(0) = 0; 735 } 736 737 /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns 738 in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found. 739 */ 740 count = Ci(i+1)-Ci(i); 741 first = Ci(i); 742 while (count > 0) { 743 j = first; 744 step = count / 2; 745 j += step; 746 if (Cj(j) < cstart) { 747 first = ++j; 748 count -= step + 1; 749 } else count = step; 750 } 751 Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */ 752 753 /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */ 754 count = Ci(i+1) - first; 755 while (count > 0) { 756 j = first; 757 step = count / 2; 758 j += step; 759 if (Cj(j) < cend) { 760 first = ++j; 761 count -= step + 1; 762 } else count = step; 763 } 764 Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */ 765 Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */ 766 }); 767 }); 768 769 /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */ 770 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 771 update += Cdi(i); 772 if (final) Cdi(i) = update; 773 }); 774 Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 775 update += Coi(i); 776 if (final) Coi(i) = update; 777 }); 778 779 /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in 780 MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co. 781 */ 782 Cdi_dual.modify_device(); 783 Coi_dual.modify_device(); 784 Cdi_dual.sync_host(); 785 Coi_dual.sync_host(); 786 PetscInt Cd_nnz = Cdi_dual.view_host().data()[m]; 787 PetscInt Co_nnz = Coi_dual.view_host().data()[m]; 788 789 /* With nnz, allocate a, j for Cd and Co */ 790 MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz); 791 MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz); 792 793 /* Fill a, j of Cd and Co on device */ 794 MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device(); 795 MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device(); 796 797 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 798 PetscInt i = t.league_rank(); /* row i */ 799 PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 800 PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 801 PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 802 PetscInt cdend = cdstart + cdlen; 803 /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 804 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 805 if (k < cdstart) { /* k in [0, cdstart) */ 806 Coa(Coi(i)+k) = Ca(Ci(i)+k); 807 Coj(Coi(i)+k) = Cj(Ci(i)+k); 808 } else if (k < cdend) { /* k in [cdstart, cdend) */ 809 Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 810 Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */ 811 } else { /* k in [cdend, clen) */ 812 Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 813 Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k); 814 } 815 }); 816 }); 817 818 Cdj_dual.modify_device(); 819 Cda_dual.modify_device(); 820 Coj_dual.modify_device(); 821 Coa_dual.modify_device(); 822 /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */ 823 auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual); 824 auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual); 825 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr); 826 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr); 827 ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */ 828 } 829 PetscFunctionReturn(0); 830 } 831 832 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids. 833 834 It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view. 835 836 Input Parameters: 837 + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 838 . reuse - indicate whether the matrix has called this function before 839 . csrmat - the KokkosCsrMatrix, of size m,N 840 - Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first 841 entry of the diag block of C in csrmat's j array. 842 843 Output Parameters: 844 + C - the updated MATMPIAIJKOKKOS matrix 845 - Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 846 847 Notes: the input matrix's col ids and col size will be changed. 848 */ 849 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g) 850 { 851 PetscErrorCode ierr; 852 Mat_SeqAIJKokkos *ckok; 853 ISLocalToGlobalMapping l2gmap; 854 const PetscInt *garray; 855 PetscInt sz; 856 857 PetscFunctionBegin; 858 /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */ 859 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr); 860 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 861 ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */ 862 ckok->j_dual.sync_device(); 863 ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */ 864 865 /* Build l2g -- the local to global mapping of C's cols */ 866 ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr); 867 ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr); 868 if (C->cmap->n != sz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz); 869 870 ConstMatColIdxKokkosViewHost tmp(garray,sz); 871 l2g = MatColIdxKokkosView("l2g",sz); 872 Kokkos::deep_copy(l2g,tmp); 873 874 ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr); 875 ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr); 876 PetscFunctionReturn(0); 877 } 878 879 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos 880 881 Input Parameters: 882 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 883 . A - an MPIAIJKOKKOS matrix 884 . B - an MPIAIJKOKKOS matrix 885 - mm - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations. 886 887 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 888 */ 889 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm) 890 { 891 PetscErrorCode ierr; 892 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 893 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 894 IS glob = NULL; 895 const PetscInt *garray; 896 PetscInt N = B->cmap->N,sz; 897 ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */ 898 MatColIdxKokkosView l2g2; 899 Mat C1,C2; /* intermediate matrices */ 900 901 PetscFunctionBegin; 902 /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */ 903 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr); 904 ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr); 905 ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr); 906 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 907 C1->product->api_user = product->api_user; 908 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 909 if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 910 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 911 912 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 913 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 914 const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */ 915 l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */ 916 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global); 917 918 /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */ 919 ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf, 920 mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr); 921 922 /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */ 923 ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr); 924 ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr); 925 ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr); 926 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 927 C2->product->api_user = product->api_user; 928 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 929 if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 930 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 931 ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global); 932 933 /* C = C1 + C2. We actually use their global col ids versions in adding */ 934 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */ 935 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 936 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 937 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 938 939 mm->C1 = C1; 940 mm->C2 = C2; 941 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 942 ierr = ISDestroy(&glob);CHKERRQ(ierr); 943 PetscFunctionReturn(0); 944 } 945 946 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos 947 948 Input Parameters: 949 + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 950 . A - an MPIAIJKOKKOS matrix 951 . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank. 952 . localB - Does B use local col ids? If false, then B is already in global col ids. 953 . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator. 954 . l2g - If localB, then l2g maps B's local col ids to global ones. 955 - mm - a struct used to stash intermediate data in AtB 956 957 Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 958 */ 959 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm) 960 { 961 PetscErrorCode ierr; 962 Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 963 Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 964 Mat C1,C2; /* intermediate matrices */ 965 966 PetscFunctionBegin; 967 /* C1 = Ad^t * B */ 968 ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr); 969 ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr); 970 ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr); 971 C1->product->api_user = product->api_user; 972 ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr); 973 if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 974 ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr); 975 976 if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);} 977 else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */ 978 979 /* C2 = Ao^t * B */ 980 ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr); 981 ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr); 982 ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr); 983 C2->product->api_user = product->api_user; 984 ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr); 985 if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 986 ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr); 987 988 ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf, 989 mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr); 990 991 mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */ 992 KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 993 /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 994 KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 995 mm->C1 = C1; 996 mm->C2 = C2; 997 PetscFunctionReturn(0); 998 } 999 1000 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C) 1001 { 1002 PetscErrorCode ierr; 1003 Mat_Product *product = C->product; 1004 MatProductType ptype; 1005 MatProductData_MPIAIJKokkos *mmdata; 1006 MatMatStruct *mm = NULL; 1007 MatMatStruct_AB *ab; 1008 MatMatStruct_AtB *atb; 1009 Mat A,B,Ad,Ao,Bd,Bo; 1010 const MatScalarType one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */ 1011 1012 PetscFunctionBegin; 1013 MatCheckProduct(C,1); 1014 mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data); 1015 ptype = product->type; 1016 A = product->A; 1017 B = product->B; 1018 Ad = static_cast<Mat_MPIAIJ*>(A->data)->A; 1019 Ao = static_cast<Mat_MPIAIJ*>(A->data)->B; 1020 Bd = static_cast<Mat_MPIAIJ*>(B->data)->A; 1021 Bo = static_cast<Mat_MPIAIJ*>(B->data)->B; 1022 1023 if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 1024 mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 1025 ab = mmdata->mmAB; 1026 atb = mmdata->mmAtB; 1027 if (ab) { 1028 static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE; 1029 static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE; 1030 } 1031 if (atb) { 1032 static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE; 1033 static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE; 1034 } 1035 PetscFunctionReturn(0); 1036 } 1037 1038 if (ptype == MATPRODUCT_AB) { 1039 ab = mmdata->mmAB; 1040 /* C1 = Ad * B_local */ 1041 if (!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB"); 1042 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1043 if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed"); 1044 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1045 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1046 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1047 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1048 /* C2 = Ao * B_other */ 1049 if (ab->C2->product->B != ab->B_other) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed"); 1050 if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1051 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); 1052 /* C = C1_global + C2_global */ 1053 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1054 mm = static_cast<MatMatStruct*>(ab); 1055 } else if (ptype == MATPRODUCT_AtB) { 1056 atb = mmdata->mmAtB; 1057 if (!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB"); 1058 /* C1 = Ad^t * B_local */ 1059 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr); 1060 if (atb->C1->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed"); 1061 if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1062 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1063 1064 /* C2 = Ao^t * B_local */ 1065 if (atb->C2->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed"); 1066 if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1067 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1068 /* Form C2_global */ 1069 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1070 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1071 /* C = C1_global + C2_global */ 1072 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1073 mm = static_cast<MatMatStruct*>(atb); 1074 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1075 ab = mmdata->mmAB; 1076 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr); 1077 1078 /* ab->C1 = Ad * B_local */ 1079 if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed"); 1080 if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);} 1081 ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr); 1082 ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 1083 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr); 1084 /* ab->C2 = Ao * B_other */ 1085 if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);} 1086 ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */ 1087 KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1088 1089 /* atb->C1 = Bd^t * ab->C_petsc */ 1090 atb = mmdata->mmAtB; 1091 if (atb->C1->product->B != ab->C_petsc) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed"); 1092 if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);} 1093 ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr); 1094 /* atb->C2 = Bo^t * ab->C_petsc */ 1095 if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);} 1096 ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr); 1097 ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 1098 atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr); 1099 KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1100 mm = static_cast<MatMatStruct*>(atb); 1101 } 1102 /* Split C_global to form C */ 1103 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1104 PetscFunctionReturn(0); 1105 } 1106 1107 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C) 1108 { 1109 PetscErrorCode ierr; 1110 Mat A,B; 1111 Mat_Product *product = C->product; 1112 MatProductType ptype; 1113 MatProductData_MPIAIJKokkos *mmdata; 1114 MatMatStruct *mm = NULL; 1115 IS glob = NULL; 1116 const PetscInt *garray; 1117 PetscInt m,n,M,N,sz; 1118 ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */ 1119 1120 PetscFunctionBegin; 1121 MatCheckProduct(C,1); 1122 if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1123 ptype = product->type; 1124 A = product->A; 1125 B = product->B; 1126 1127 switch (ptype) { 1128 case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break; 1129 case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break; 1130 case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */ 1131 default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 1132 } 1133 1134 ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr); 1135 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 1136 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 1137 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr); 1138 1139 mmdata = new MatProductData_MPIAIJKokkos(); 1140 mmdata->reusesym = product->api_user; 1141 1142 if (ptype == MATPRODUCT_AB) { 1143 mmdata->mmAB = new MatMatStruct_AB(); 1144 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr); 1145 mm = static_cast<MatMatStruct*>(mmdata->mmAB); 1146 } else if (ptype == MATPRODUCT_AtB) { 1147 mmdata->mmAtB = new MatMatStruct_AtB(); 1148 auto atb = mmdata->mmAtB; 1149 ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr); 1150 ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr); 1151 ierr = ISGetSize(glob,&sz);CHKERRQ(ierr); 1152 l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz)); 1153 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr); 1154 ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr); 1155 ierr = ISDestroy(&glob);CHKERRQ(ierr); 1156 mm = static_cast<MatMatStruct*>(atb); 1157 } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1158 mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */ 1159 mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */ 1160 auto ab = mmdata->mmAB; 1161 auto atb = mmdata->mmAtB; 1162 ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr); 1163 auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */ 1164 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr); 1165 ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr); 1166 mm = static_cast<MatMatStruct*>(atb); 1167 } 1168 /* Split the C_global into petsc A, B format */ 1169 ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr); 1170 C->product->data = mmdata; 1171 C->product->destroy = MatProductDataDestroy_MPIAIJKokkos; 1172 C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos; 1173 PetscFunctionReturn(0); 1174 } 1175 1176 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat) 1177 { 1178 PetscErrorCode ierr; 1179 Mat_Product *product = mat->product; 1180 PetscBool match = PETSC_FALSE; 1181 PetscBool usecpu = PETSC_FALSE; 1182 1183 PetscFunctionBegin; 1184 MatCheckProduct(mat,1); 1185 if (!product->A->boundtocpu && !product->B->boundtocpu) { 1186 ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr); 1187 } 1188 if (match) { /* we can always fallback to the CPU if requested */ 1189 switch (product->type) { 1190 case MATPRODUCT_AB: 1191 if (product->api_user) { 1192 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 1193 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1194 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1195 } else { 1196 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 1197 ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1198 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1199 } 1200 break; 1201 case MATPRODUCT_AtB: 1202 if (product->api_user) { 1203 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 1204 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1205 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1206 } else { 1207 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 1208 ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1209 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1210 } 1211 break; 1212 case MATPRODUCT_PtAP: 1213 if (product->api_user) { 1214 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 1215 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1216 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1217 } else { 1218 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 1219 ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 1220 ierr = PetscOptionsEnd();CHKERRQ(ierr); 1221 } 1222 break; 1223 default: 1224 break; 1225 } 1226 match = (PetscBool)!usecpu; 1227 } 1228 if (match) { 1229 switch (product->type) { 1230 case MATPRODUCT_AB: 1231 case MATPRODUCT_AtB: 1232 case MATPRODUCT_PtAP: 1233 mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos; 1234 break; 1235 default: 1236 break; 1237 } 1238 } 1239 /* fallback to MPIAIJ ops */ 1240 if (!mat->ops->productsymbolic) { 1241 ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 1242 } 1243 PetscFunctionReturn(0); 1244 } 1245 1246 /* std::upper_bound(): Given a sorted array, return index of the first element in range [first,last) whose value 1247 is greater than value, or last if there is no such element. 1248 */ 1249 PETSC_STATIC_INLINE PetscErrorCode PetscSortedIntUpperBound(PetscInt *array,PetscInt first,PetscInt last,PetscInt value,PetscInt *upper) 1250 { 1251 PetscInt it,step,count = last - first; 1252 1253 PetscFunctionBegin; 1254 while (count > 0) { 1255 it = first; 1256 step = count / 2; 1257 it += step; 1258 if (!(value < array[it])) { 1259 first = ++it; 1260 count -= step + 1; 1261 } else count = step; 1262 } 1263 *upper = first; 1264 PetscFunctionReturn(0); 1265 } 1266 1267 /* Merge two sets of sorted nonzero entries and return a CSR for the merged (sequential) matrix 1268 1269 Input Parameters: 1270 1271 j1,rowBegin1,rowEnd1,perm1,jmap1: describe the first set of nonzeros (Set1) 1272 j2,rowBegin2,rowEnd2,perm2,jmap2: describe the second set of nonzeros (Set2) 1273 1274 mat: both sets' entries are on m rows, where m is the number of local rows of the matrix mat 1275 1276 For Set1, j1[] contains column indices of the nonzeros. 1277 For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k 1278 respectively (note rowEnd1[k] is not necessarily equal to rwoBegin1[k+1]). Indices in this range of j1[] are sorted, 1279 but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1. 1280 1281 Similar for Set2. 1282 1283 This routine merges the two sets of nonzeros row by row and removes repeats. 1284 1285 Output Parameters: (memories are allocated by the caller) 1286 1287 i[],j[]: the CSR of the merged matrix, which has m rows. 1288 imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix. 1289 imap2[]: similar to imap1[], but for Set2. 1290 Note we order nonzeros row-by-row and from left to right. 1291 */ 1292 static PetscErrorCode MatMergeEntries_Internal(Mat mat,const PetscInt *j1,const PetscInt *j2,const PetscInt *rowBegin1,const PetscInt *rowEnd1, 1293 const PetscInt *rowBegin2,const PetscInt *rowEnd2,const MatRowMapKokkosViewHost& jmap1_h,const MatRowMapKokkosViewHost& jmap2_h, 1294 MatRowMapKokkosViewHost& imap1_h,MatRowMapKokkosViewHost& imap2_h,PetscInt *i,PetscInt *j) 1295 { 1296 PetscErrorCode ierr; 1297 PetscInt r,m,t,t1,t2,b1,e1,b2,e2; 1298 PetscInt *jmap1 = jmap1_h.data(),*jmap2 = jmap2_h.data(),*imap1 = imap1_h.data(),*imap2 = imap2_h.data(); 1299 1300 PetscFunctionBegin; 1301 ierr = MatGetLocalSize(mat,&m,NULL);CHKERRQ(ierr); 1302 t1 = t2 = t = 0; /* Count unique nonzeros of in Set1, Set1 and the merged respectively */ 1303 i[0] = 0; 1304 for (r=0; r<m; r++) { /* Do row by row merging */ 1305 b1 = rowBegin1[r]; 1306 e1 = rowEnd1[r]; 1307 b2 = rowBegin2[r]; 1308 e2 = rowEnd2[r]; 1309 while (b1 < e1 && b2 < e2) { 1310 if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */ 1311 j[t] = j1[b1]; 1312 imap1[t1] = t; 1313 imap2[t2] = t; 1314 b1 += jmap1[t1+1] - jmap1[t1]; /* Jump to next unique local nonzero */ 1315 b2 += jmap2[t2+1] - jmap2[t2]; /* Jump to next unique remote nonzero */ 1316 t1++; t2++; t++; 1317 } else if (j1[b1] < j2[b2]) { 1318 j[t] = j1[b1]; 1319 imap1[t1] = t; 1320 b1 += jmap1[t1+1] - jmap1[t1]; 1321 t1++; t++; 1322 } else { 1323 j[t] = j2[b2]; 1324 imap2[t2] = t; 1325 b2 += jmap2[t2+1] - jmap2[t2]; 1326 t2++; t++; 1327 } 1328 } 1329 /* Merge the remaining in either j1[] or j2[] */ 1330 while (b1 < e1) { 1331 j[t] = j1[b1]; 1332 imap1[t1] = t; 1333 b1 += jmap1[t1+1] - jmap1[t1]; 1334 t1++; t++; 1335 } 1336 while (b2 < e2) { 1337 j[t] = j2[b2]; 1338 imap2[t2] = t; 1339 b2 += jmap2[t2+1] - jmap2[t2]; 1340 t2++; t++; 1341 } 1342 i[r+1] = t; 1343 } 1344 PetscFunctionReturn(0); 1345 } 1346 1347 /* Split a set/group of local entries into two subsets: those in the diagonal block and those in the off-diagonal block 1348 1349 Input Parameters: 1350 mat: an MPI matrix that provides row and column layout information for splitting. Let's assume its number of local rows is m. 1351 n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[] 1352 respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n. 1353 1354 i[] is already sorted, but within a row, j[] is not sorted and might have repeats. 1355 i[] might contain negative indices at the beginning, which says the corresponding entries should be ignored in the splitting. 1356 1357 Output Parameters: 1358 j[],perm[]: the routine needs to sort j[] within each row along with perm[]. 1359 rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller. 1360 They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries in row r of the diagonal block, 1361 and [rowMid[r],rowEnd[r]) point to begin/end entries in row r of the off-diagonal block. 1362 1363 Aperm_h,Ajmap_h: They are Kokkos views on host. This routine will resize and fill them with proper values. Let's say Aperm = Aperm_h.data(), 1364 and Ajmap = Ajmap_h.data(). Aperm[] stores values from perm[] for entries in the diagonal block. Hence length of Aperm[] is the number 1365 of entries in the diagonal block, though those entries might have repeats (i.e., same 'i,j' pair). 1366 Ajmap[] stores the number of repeats of each unique nonzero in the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t] is the number of 1367 repeats for the t-th unique nonzero in the diagonal block. Ajmap[0] is always 0. 1368 Length of Aperm_h is the number of nonzeros in the diagonal block. 1369 Length of Ajmap_h is the number of unique nonzeros in the diagonal block + 1. 1370 1371 Bperm_h and Bjmap_h are similar to Aperm_h and Ajmap_h, respectively, but for the off-diagonal block. 1372 */ 1373 1374 static PetscErrorCode MatSplitEntries_Internal(Mat mat,PetscInt n,const PetscInt i[], 1375 PetscInt j[],PetscInt perm[],PetscInt rowBegin[],PetscInt rowMid[],PetscInt rowEnd[], 1376 MatRowMapKokkosViewHost& Aperm_h,MatRowMapKokkosViewHost& Ajmap_h,MatRowMapKokkosViewHost& Bperm_h,MatRowMapKokkosViewHost& Bjmap_h) 1377 { 1378 PetscErrorCode ierr; 1379 PetscInt cstart,cend,rstart,rend,mid; 1380 PetscInt Atot=0,Btot=0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */ 1381 PetscInt Annz=0,Bnnz=0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */ 1382 PetscInt k,m,p,q,r,s,row,col; 1383 PetscInt *Aperm,*Bperm,*Ajmap,*Bjmap; 1384 1385 PetscFunctionBegin; 1386 ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr); 1387 ierr = PetscLayoutGetRange(mat->cmap,&cstart,&cend);CHKERRQ(ierr); 1388 m = rend - rstart; 1389 1390 for (k=0; k<n; k++) {if (i[k]>=0) break;} /* Skip negative rows */ 1391 1392 /* Process [k,n): sort and partition each local row into diag and offdiag portions, 1393 fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz. 1394 */ 1395 while (k<n) { 1396 row = i[k]; 1397 /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */ 1398 for (s=k; s<n; s++) if (i[s] != row) break; 1399 for (p=k; p<s; p++) { 1400 if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_MAX_INT; /* Shift diag columns to range of [-PETSC_MAX_INT, -1] */ 1401 #if defined(PETSC_USE_DEBUG) 1402 else if (j[p] < 0 || j[p] > mat->cmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column index %" PetscInt_FMT " is out of range",j[p]); 1403 #endif 1404 } 1405 ierr = PetscSortIntWithArray(s-k,j+k,perm+k);CHKERRQ(ierr); 1406 ierr = PetscSortedIntUpperBound(j,k,s,-1,&mid);CHKERRQ(ierr); /* Seperate [k,s) into [k,mid) for diag and [mid,s) for offdiag */ 1407 rowBegin[row-rstart] = k; 1408 rowMid[row-rstart] = mid; 1409 rowEnd[row-rstart] = s; 1410 1411 /* Count nonzeros of this diag/offdiag row, which might have repeats */ 1412 Atot += mid - k; 1413 Btot += s - mid; 1414 1415 /* Count unique nonzeros of this diag/offdiag row */ 1416 for (p=k; p<mid;) { 1417 col = j[p]; 1418 do {j[p] += PETSC_MAX_INT; p++;} while (p<mid && j[p] == col); /* Revert the modified diagonal indices */ 1419 Annz++; 1420 } 1421 1422 for (p=mid; p<s;) { 1423 col = j[p]; 1424 do {p++;} while (p<s && j[p] == col); 1425 Bnnz++; 1426 } 1427 k = s; 1428 } 1429 1430 /* Resize views according to Atot, Btot, Annz, Bnnz */ 1431 Kokkos::resize(Aperm_h,Atot); 1432 Kokkos::resize(Ajmap_h,Annz+1); 1433 Kokkos::resize(Bperm_h,Btot); 1434 Kokkos::resize(Bjmap_h,Bnnz+1); 1435 Aperm = Aperm_h.data(); 1436 Bperm = Bperm_h.data(); 1437 Ajmap = Ajmap_h.data(); 1438 Bjmap = Bjmap_h.data(); 1439 Ajmap[0] = 0; 1440 Bjmap[0] = 0; 1441 1442 /* Re-scan indices and copy diag/offdiag permuation indices to Aperm, Bperm and also fill Ajmap and Bjmap */ 1443 Atot = Btot = Annz = Bnnz = 0; 1444 for (r=0; r<m; r++) { 1445 k = rowBegin[r]; 1446 mid = rowMid[r]; 1447 s = rowEnd[r]; 1448 ierr = PetscArraycpy(Aperm+Atot,perm+k, mid-k);CHKERRQ(ierr); 1449 ierr = PetscArraycpy(Bperm+Btot,perm+mid,s-mid);CHKERRQ(ierr); 1450 Atot += mid - k; 1451 Btot += s - mid; 1452 1453 /* Scan column indices in this row and find out how many repeats each unique nonzero has */ 1454 for (p=k; p<mid;) { 1455 col = j[p]; 1456 q = p; 1457 do {p++;} while (p<mid && j[p] == col); 1458 Ajmap[Annz+1] = Ajmap[Annz] + (p - q); 1459 Annz++; 1460 } 1461 1462 for (p=mid; p<s;) { 1463 col = j[p]; 1464 q = p; 1465 do {p++;} while (p<s && j[p] == col); 1466 Bjmap[Bnnz+1] = Bjmap[Bnnz] + (p - q); 1467 Bnnz++; 1468 } 1469 } 1470 PetscFunctionReturn(0); 1471 } 1472 1473 static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscInt coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1474 { 1475 PetscErrorCode ierr; 1476 MPI_Comm comm; 1477 PetscMPIInt rank,size; 1478 PetscInt m,n,M,N,k,p,q,rstart,rend,cstart,cend,rem; 1479 1480 PetscFunctionBegin; 1481 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 1482 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1483 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 1484 1485 ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr); 1486 ierr = PetscLayoutGetRange(mat->cmap,&cstart,&cend);CHKERRQ(ierr); 1487 ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr); 1488 ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 1489 1490 /* ---------------------------------------------------------------------------*/ 1491 /* Sort (i,j) by row along with a permuation array, so that the to-be-ignored */ 1492 /* entries come first, then local rows, then remote rows. */ 1493 /* ---------------------------------------------------------------------------*/ 1494 PetscInt n1 = coo_n,*i1,*j1,*perm1; /* Copies of input COOs along with a permutation array */ 1495 ierr = PetscMalloc3(n1,&i1,n1,&j1,n1,&perm1);CHKERRQ(ierr); 1496 ierr = PetscArraycpy(i1,coo_i,n1);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 1497 ierr = PetscArraycpy(j1,coo_j,n1);CHKERRQ(ierr); 1498 for (k=0; k<n1; k++) perm1[k] = k; 1499 1500 /* Manipulate indices so that entries with negative row or col indices will have smallest 1501 row indices, local entries will have greater but negative row indices, and remote entries 1502 will have positive row indices. 1503 */ 1504 for (k=0; k<n1; k++) { 1505 if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_MIN_INT; /* e.g., -2^31, minimal to move them ahead */ 1506 else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_MAX_INT; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_MAX_INT, -1] */ 1507 } 1508 1509 /* Sort by row; after that, [0,k) have ignored entires, [k,rem) have local rows and [rem,n1) have remote rows */ 1510 ierr = PetscSortIntWithArrayPair(n1,i1,j1,perm1);CHKERRQ(ierr); 1511 for (k=0; k<n1; k++) {if (i1[k] > PETSC_MIN_INT) break;} /* Advance k to the first entry we need to take care of */ 1512 ierr = PetscSortedIntUpperBound(i1,k,n1,rend-1-PETSC_MAX_INT,&rem);CHKERRQ(ierr); /* rem is upper bound of the last local row */ 1513 for (; k<rem; k++) i1[k] += PETSC_MAX_INT; /* Revert row indices of local rows*/ 1514 1515 /* ---------------------------------------------------------------------------*/ 1516 /* Split local rows into diag/offdiag portions */ 1517 /* ---------------------------------------------------------------------------*/ 1518 PetscInt *rowBegin1,*rowMid1,*rowEnd1; 1519 MatRowMapKokkosViewHost Ajmap1_h,Aperm1_h,Bjmap1_h,Bperm1_h,Cperm1_h("Cperm1_h",n1-rem); 1520 1521 ierr = PetscCalloc3(m,&rowBegin1,m,&rowMid1,m,&rowEnd1);CHKERRQ(ierr); 1522 ierr = MatSplitEntries_Internal(mat,rem,i1,j1,perm1,rowBegin1,rowMid1,rowEnd1,Aperm1_h,Ajmap1_h,Bperm1_h,Bjmap1_h);CHKERRQ(ierr); 1523 1524 /* ---------------------------------------------------------------------------*/ 1525 /* Send remote rows to their owner */ 1526 /* ---------------------------------------------------------------------------*/ 1527 /* Find which rows should be sent to which remote ranks*/ 1528 PetscInt nsend = 0; 1529 PetscMPIInt *sendto; /* Of length nsend, storing remote ranks */ 1530 PetscInt *nentries; /* Of length nsend, storing number of entries to be sent to each remote rank */ 1531 const PetscInt *ranges; 1532 PetscInt maxNsend = size >= 128? 128 : size; /* Assume max 128 neighbors; realloc when needed */ 1533 1534 ierr = PetscLayoutGetRanges(mat->rmap,&ranges);CHKERRQ(ierr); 1535 ierr = PetscMalloc2(maxNsend,&sendto,maxNsend,&nentries);CHKERRQ(ierr); 1536 for (k=rem; k<n1;) { 1537 PetscMPIInt owner; 1538 PetscInt firstRow,lastRow; 1539 /* Locate a row range */ 1540 firstRow = i1[k]; /* first row of this owner */ 1541 ierr = PetscLayoutFindOwner(mat->rmap,firstRow,&owner);CHKERRQ(ierr); 1542 lastRow = ranges[owner+1]-1; /* last row of this owner */ 1543 1544 /* Find the first index 'p' in [k,n) with i[p] belonging to next owner */ 1545 ierr = PetscSortedIntUpperBound(i1,k,n1,lastRow,&p);CHKERRQ(ierr); 1546 1547 /* All entries in [k,p) belong to this remote owner */ 1548 if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */ 1549 PetscMPIInt *sendto2; 1550 PetscInt *nentries2; 1551 PetscInt maxNsend2 = (maxNsend <= size/2) ? maxNsend*2 : size; 1552 ierr = PetscMalloc2(maxNsend2,&sendto2,maxNsend2,&nentries2);CHKERRQ(ierr); 1553 ierr = PetscArraycpy(sendto2,sendto,maxNsend);CHKERRQ(ierr); 1554 ierr = PetscArraycpy(nentries2,nentries2,maxNsend+1);CHKERRQ(ierr); 1555 ierr = PetscFree2(sendto,nentries2);CHKERRQ(ierr); 1556 sendto = sendto2; 1557 nentries = nentries2; 1558 maxNsend = maxNsend2; 1559 } 1560 sendto[nsend] = owner; 1561 nentries[nsend] = p - k; 1562 nsend++; 1563 k = p; 1564 } 1565 1566 /* Build 1st SF to know offsets on remote to send data */ 1567 PetscSF sf1; 1568 PetscInt nroots = 1,nroots2 = 0; 1569 PetscInt nleaves = nsend,nleaves2 = 0; 1570 PetscInt *offsets; 1571 PetscSFNode *iremote; 1572 1573 ierr = PetscSFCreate(comm,&sf1);CHKERRQ(ierr); 1574 ierr = PetscMalloc1(nsend,&iremote);CHKERRQ(ierr); 1575 ierr = PetscMalloc1(nsend,&offsets);CHKERRQ(ierr); 1576 for (k=0; k<nsend; k++) { 1577 iremote[k].rank = sendto[k]; 1578 iremote[k].index = 0; 1579 nleaves2 += nentries[k]; 1580 } 1581 ierr = PetscSFSetGraph(sf1,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1582 ierr = PetscSFFetchAndOpWithMemTypeBegin(sf1,MPIU_INT,PETSC_MEMTYPE_HOST,&nroots2/*rootdata*/,PETSC_MEMTYPE_HOST,nentries/*leafdata*/,PETSC_MEMTYPE_HOST,offsets/*leafupdate*/,MPI_SUM);CHKERRQ(ierr); 1583 ierr = PetscSFFetchAndOpEnd(sf1,MPIU_INT,&nroots2,nentries,offsets,MPI_SUM);CHKERRQ(ierr); 1584 ierr = PetscSFDestroy(&sf1);CHKERRQ(ierr); 1585 1586 /* Build 2nd SF to send remote COOs to their owner */ 1587 PetscSF sf2; 1588 nroots = nroots2; 1589 nleaves = nleaves2; 1590 ierr = PetscSFCreate(comm,&sf2);CHKERRQ(ierr); 1591 ierr = PetscSFSetFromOptions(sf2);CHKERRQ(ierr); 1592 ierr = PetscMalloc1(nleaves,&iremote);CHKERRQ(ierr); 1593 p = 0; 1594 for (k=0; k<nsend; k++) { 1595 for (q=0; q<nentries[k]; q++,p++) { 1596 iremote[p].rank = sendto[k]; 1597 iremote[p].index = offsets[k] + q; 1598 } 1599 } 1600 ierr = PetscSFSetGraph(sf2,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr); 1601 1602 /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permuation which will be used to fill leafdata */ 1603 ierr = PetscArraycpy(Cperm1_h.data(),perm1+rem,n1-rem);CHKERRQ(ierr); 1604 1605 /* Send the remote COOs to their owner */ 1606 PetscInt n2 = nroots,*i2,*j2,*perm2; /* Buffers for received COOs from other ranks, along with a permutation array */ 1607 ierr = PetscMalloc3(n2,&i2,n2,&j2,n2,&perm2);CHKERRQ(ierr); 1608 ierr = PetscSFReduceWithMemTypeBegin(sf2,MPIU_INT,PETSC_MEMTYPE_HOST,i1+rem,PETSC_MEMTYPE_HOST,i2,MPI_REPLACE);CHKERRQ(ierr); 1609 ierr = PetscSFReduceEnd(sf2,MPIU_INT,i1+rem,i2,MPI_REPLACE);CHKERRQ(ierr); 1610 ierr = PetscSFReduceWithMemTypeBegin(sf2,MPIU_INT,PETSC_MEMTYPE_HOST,j1+rem,PETSC_MEMTYPE_HOST,j2,MPI_REPLACE);CHKERRQ(ierr); 1611 ierr = PetscSFReduceEnd(sf2,MPIU_INT,j1+rem,j2,MPI_REPLACE);CHKERRQ(ierr); 1612 1613 ierr = PetscFree(offsets);CHKERRQ(ierr); 1614 ierr = PetscFree2(sendto,nentries);CHKERRQ(ierr); 1615 1616 /* ---------------------------------------------------------------*/ 1617 /* Sort received COOs by row along with the permutation array */ 1618 /* ---------------------------------------------------------------*/ 1619 for (k=0; k<n2; k++) perm2[k] = k; 1620 ierr = PetscSortIntWithArrayPair(n2,i2,j2,perm2);CHKERRQ(ierr); 1621 1622 /* ---------------------------------------------------------------*/ 1623 /* Split received COOs into diag/offdiag portions */ 1624 /* ---------------------------------------------------------------*/ 1625 PetscInt *rowBegin2,*rowMid2,*rowEnd2; 1626 MatRowMapKokkosViewHost Ajmap2_h,Aperm2_h,Bjmap2_h,Bperm2_h; 1627 1628 ierr = PetscCalloc3(m,&rowBegin2,m,&rowMid2,m,&rowEnd2);CHKERRQ(ierr); 1629 ierr = MatSplitEntries_Internal(mat,n2,i2,j2,perm2,rowBegin2,rowMid2,rowEnd2,Aperm2_h,Ajmap2_h,Bperm2_h,Bjmap2_h);CHKERRQ(ierr); 1630 1631 /* --------------------------------------------------------------------------*/ 1632 /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */ 1633 /* --------------------------------------------------------------------------*/ 1634 PetscInt Annz1,Annz2,Bnnz1,Bnnz2; 1635 PetscInt *Ai,*Aj,*Bi,*Bj; 1636 1637 Annz1 = Ajmap1_h.extent(0)-1; /* Number of unique local nonzeros in the diagonal block */ 1638 Annz2 = Ajmap2_h.extent(0)-1; /* Number of unique received nonzeros in the diagonal block */ 1639 Bnnz1 = Bjmap1_h.extent(0)-1; /* Similar, but for the off-diagonal block */ 1640 Bnnz2 = Bjmap2_h.extent(0)-1; 1641 ierr = PetscMalloc1(m+1,&Ai);CHKERRQ(ierr); 1642 ierr = PetscMalloc1(m+1,&Bi);CHKERRQ(ierr); 1643 ierr = PetscMalloc1(Annz1+Annz2,&Aj);CHKERRQ(ierr); /* Since local and remote entries might have dups, we might allocate excess memory */ 1644 ierr = PetscMalloc1(Bnnz1+Bnnz2,&Bj);CHKERRQ(ierr); 1645 1646 MatRowMapKokkosViewHost Aimap1_h("Aimpa1",Annz1),Aimap2_h("Aimpa2",Annz2),Bimap1_h("Bimap1",Bnnz1),Bimap2_h("Bimap2",Bnnz2); 1647 ierr = MatMergeEntries_Internal(mat,j1,j2,rowBegin1,rowMid1,rowBegin2,rowMid2,Ajmap1_h,Ajmap2_h,Aimap1_h,Aimap2_h,Ai,Aj);CHKERRQ(ierr); 1648 ierr = MatMergeEntries_Internal(mat,j1,j2,rowMid1, rowEnd1,rowMid2, rowEnd2,Bjmap1_h,Bjmap2_h,Bimap1_h,Bimap2_h,Bi,Bj);CHKERRQ(ierr); 1649 ierr = PetscFree3(rowBegin1,rowMid1,rowEnd1);CHKERRQ(ierr); 1650 ierr = PetscFree3(rowBegin2,rowMid2,rowEnd2);CHKERRQ(ierr); 1651 ierr = PetscFree3(i1,j1,perm1);CHKERRQ(ierr); 1652 ierr = PetscFree3(i2,j2,perm2);CHKERRQ(ierr); 1653 1654 /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */ 1655 PetscInt Annz = Ai[m]; 1656 PetscInt Bnnz = Bi[m]; 1657 if (Annz < Annz1 + Annz2) { 1658 PetscInt *Aj_new; 1659 ierr = PetscMalloc1(Annz,&Aj_new);CHKERRQ(ierr); 1660 ierr = PetscArraycpy(Aj_new,Aj,Annz);CHKERRQ(ierr); 1661 ierr = PetscFree(Aj);CHKERRQ(ierr); 1662 Aj = Aj_new; 1663 } 1664 1665 if (Bnnz < Bnnz1 + Bnnz2) { 1666 PetscInt *Bj_new; 1667 ierr = PetscMalloc1(Bnnz,&Bj_new);CHKERRQ(ierr); 1668 ierr = PetscArraycpy(Bj_new,Bj,Bnnz);CHKERRQ(ierr); 1669 ierr = PetscFree(Bj);CHKERRQ(ierr); 1670 Bj = Bj_new; 1671 } 1672 1673 /* --------------------------------------------------------------------------------*/ 1674 /* Create a MPIAIJKOKKOS newmat with CSRs of A and B, then replace mat with newmat */ 1675 /* --------------------------------------------------------------------------------*/ 1676 Mat newmat; 1677 PetscScalar *Aa,*Ba; 1678 Mat_MPIAIJ *mpiaij; 1679 Mat_SeqAIJ *a,*b; 1680 1681 ierr = PetscMalloc1(Annz,&Aa);CHKERRQ(ierr); 1682 ierr = PetscMalloc1(Bnnz,&Ba);CHKERRQ(ierr); 1683 /* make Aj[] local, i.e, based off the start column of the diagonal portion */ 1684 if (cstart) {for (k=0; k<Annz; k++) Aj[k] -= cstart;} 1685 ierr = MatCreateMPIAIJWithSplitArrays(comm,m,n,M,N,Ai,Aj,Aa,Bi,Bj,Ba,&newmat);CHKERRQ(ierr); 1686 mpiaij = (Mat_MPIAIJ*)newmat->data; 1687 a = (Mat_SeqAIJ*)mpiaij->A->data; 1688 b = (Mat_SeqAIJ*)mpiaij->B->data; 1689 a->singlemalloc = b->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa,Bi,Bj,Ba */ 1690 a->free_a = b->free_a = PETSC_TRUE; 1691 a->free_ij = b->free_ij = PETSC_TRUE; 1692 ierr = MatConvert(newmat,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 1693 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 1694 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 1695 mpiaij = (Mat_MPIAIJ*)mat->data; 1696 mpiaij->spptr = new Mat_MPIAIJKokkos(n1,sf2,nroots,nleaves,Annz1,Annz2,Bnnz1,Bnnz2, 1697 Aimap1_h,Aimap2_h,Bimap1_h,Bimap2_h, 1698 Ajmap1_h,Ajmap2_h,Bjmap1_h,Bjmap2_h, 1699 Aperm1_h,Aperm2_h,Bperm1_h,Bperm2_h,Cperm1_h); 1700 ierr = PetscSFDestroy(&sf2);CHKERRQ(ierr); /* ctor of Mat_MPIAIJKokkos already took a reference of sf3 */ 1701 PetscFunctionReturn(0); 1702 } 1703 1704 static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode) 1705 { 1706 PetscErrorCode ierr; 1707 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 1708 Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 1709 Mat A = mpiaij->A,B = mpiaij->B; 1710 PetscInt Annz1 = mpikok->Annz1,Annz2 = mpikok->Annz2,Bnnz1 = mpikok->Bnnz1,Bnnz2 = mpikok->Bnnz2; 1711 MatScalarKokkosView Aa,Ba; 1712 ConstMatScalarKokkosView v1; 1713 MatScalarKokkosView& vsend = mpikok->sendbuf_d; 1714 const MatScalarKokkosView& v2 = mpikok->recvbuf_d; 1715 const MatRowMapKokkosView& Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d; 1716 const MatRowMapKokkosView& Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d; 1717 const MatRowMapKokkosView& Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d; 1718 const MatRowMapKokkosView& Cperm1 = mpikok->Cperm1_d; 1719 PetscMemType memtype; 1720 1721 PetscFunctionBegin; 1722 if (!v) { /* NULL v means an all zero array */ 1723 ierr = MatZeroEntries(mat);CHKERRQ(ierr); 1724 PetscFunctionReturn(0); 1725 } 1726 1727 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 1728 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */ 1729 v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,mpikok->coo_n)); 1730 } else { 1731 v1 = ConstMatScalarKokkosView(v,mpikok->coo_n); /* Directly use v[]'s memory */ 1732 } 1733 1734 ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 1735 ierr = MatSeqAIJGetKokkosView(B,&Ba);CHKERRQ(ierr); 1736 if (imode == INSERT_VALUES) { 1737 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1738 Kokkos::deep_copy(Ba,0.0); 1739 } 1740 1741 /* Pack entries to be sent to remote */ 1742 Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscInt i) {vsend(i) = v1(Cperm1(i));}); 1743 1744 /* Send remote entries to their owner and overlap the communication with local computation */ 1745 ierr = PetscSFReduceWithMemTypeBegin(mpikok->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE);CHKERRQ(ierr); 1746 /* Add local entries to A and B */ 1747 Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));}); 1748 Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));}); 1749 ierr = PetscSFReduceEnd(mpikok->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE);CHKERRQ(ierr); 1750 1751 /* Add received remote entries to A and B */ 1752 Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));}); 1753 Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));}); 1754 1755 ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 1756 ierr = MatSeqAIJRestoreKokkosView(B,&Ba);CHKERRQ(ierr); 1757 1758 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1759 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1760 PetscFunctionReturn(0); 1761 } 1762 1763 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A) 1764 { 1765 PetscErrorCode ierr; 1766 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 1767 1768 PetscFunctionBegin; 1769 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 1770 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 1771 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C", NULL);CHKERRQ(ierr); 1772 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 1773 delete (Mat_MPIAIJKokkos*)mpiaij->spptr; 1774 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 1775 PetscFunctionReturn(0); 1776 } 1777 1778 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1779 { 1780 PetscErrorCode ierr; 1781 Mat B; 1782 Mat_MPIAIJ *a; 1783 1784 PetscFunctionBegin; 1785 if (reuse == MAT_INITIAL_MATRIX) { 1786 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 1787 } else if (reuse == MAT_REUSE_MATRIX) { 1788 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 1789 } 1790 B = *newmat; 1791 1792 B->boundtocpu = PETSC_FALSE; 1793 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1794 ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 1795 ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1796 1797 a = static_cast<Mat_MPIAIJ*>(A->data); 1798 if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1799 if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);} 1800 if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);} 1801 1802 B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 1803 B->ops->mult = MatMult_MPIAIJKokkos; 1804 B->ops->multadd = MatMultAdd_MPIAIJKokkos; 1805 B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1806 B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos; 1807 B->ops->destroy = MatDestroy_MPIAIJKokkos; 1808 1809 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 1810 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr); 1811 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);CHKERRQ(ierr); 1812 ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);CHKERRQ(ierr); 1813 PetscFunctionReturn(0); 1814 } 1815 1816 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 1817 { 1818 PetscErrorCode ierr; 1819 1820 PetscFunctionBegin; 1821 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1822 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 1823 ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 1824 PetscFunctionReturn(0); 1825 } 1826 1827 /*@C 1828 MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1829 (the default parallel PETSc format). This matrix will ultimately pushed down 1830 to Kokkos for calculations. For good matrix 1831 assembly performance the user should preallocate the matrix storage by setting 1832 the parameter nz (or the array nnz). By setting these parameters accurately, 1833 performance during matrix assembly can be increased by more than a factor of 50. 1834 1835 Collective 1836 1837 Input Parameters: 1838 + comm - MPI communicator, set to PETSC_COMM_SELF 1839 . m - number of rows 1840 . n - number of columns 1841 . nz - number of nonzeros per row (same for all rows) 1842 - nnz - array containing the number of nonzeros in the various rows 1843 (possibly different for each row) or NULL 1844 1845 Output Parameter: 1846 . A - the matrix 1847 1848 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1849 MatXXXXSetPreallocation() paradigm instead of this routine directly. 1850 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1851 1852 Notes: 1853 If nnz is given then nz is ignored 1854 1855 The AIJ format (also called the Yale sparse matrix format or 1856 compressed row storage), is fully compatible with standard Fortran 77 1857 storage. That is, the stored row and column indices can begin at 1858 either one (as in Fortran) or zero. See the users' manual for details. 1859 1860 Specify the preallocated storage with either nz or nnz (not both). 1861 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1862 allocation. For large problems you MUST preallocate memory or you 1863 will get TERRIBLE performance, see the users' manual chapter on matrices. 1864 1865 By default, this format uses inodes (identical nodes) when possible, to 1866 improve numerical efficiency of matrix-vector products and solves. We 1867 search for consecutive rows with the same nonzero structure, thereby 1868 reusing matrix information to achieve increased efficiency. 1869 1870 Level: intermediate 1871 1872 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 1873 @*/ 1874 PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 1875 { 1876 PetscErrorCode ierr; 1877 PetscMPIInt size; 1878 1879 PetscFunctionBegin; 1880 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1881 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 1882 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1883 if (size > 1) { 1884 ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1885 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 1886 } else { 1887 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1888 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 1889 } 1890 PetscFunctionReturn(0); 1891 } 1892 1893 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 1894 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 1895 { 1896 PetscMPIInt size,rank; 1897 MPI_Comm comm; 1898 PetscErrorCode ierr; 1899 PetscSplitCSRDataStructure d_mat=NULL; 1900 1901 PetscFunctionBegin; 1902 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1903 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1904 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 1905 if (size == 1) { 1906 ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 1907 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */ 1908 } else { 1909 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1910 ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 1911 ierr = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr); 1912 ierr = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr); 1913 } 1914 // act like MatSetValues because not called on host 1915 if (A->assembled) { 1916 if (A->was_assembled) { 1917 ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 1918 } 1919 A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 1920 } else { 1921 ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr); 1922 // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 1923 } 1924 if (!d_mat) { 1925 struct _n_SplitCSRMat h_mat; /* host container */ 1926 Mat_SeqAIJKokkos *aijkokA; 1927 Mat_SeqAIJ *jaca; 1928 PetscInt n = A->rmap->n, nnz; 1929 Mat Amat; 1930 PetscInt *colmap; 1931 1932 /* create and copy h_mat */ 1933 h_mat.M = A->cmap->N; // use for debug build 1934 ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 1935 if (size == 1) { 1936 Amat = A; 1937 jaca = (Mat_SeqAIJ*)A->data; 1938 h_mat.rstart = 0; h_mat.rend = A->rmap->n; 1939 h_mat.cstart = 0; h_mat.cend = A->cmap->n; 1940 h_mat.offdiag.i = h_mat.offdiag.j = NULL; 1941 h_mat.offdiag.a = NULL; 1942 aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1943 aijkokA->i_uncompressed_d = NULL; 1944 aijkokA->colmap_d = NULL; 1945 } else { 1946 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1947 Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 1948 PetscInt ii; 1949 Mat_SeqAIJKokkos *aijkokB; 1950 1951 Amat = aij->A; 1952 aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 1953 aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 1954 aijkokA->i_uncompressed_d = NULL; 1955 aijkokA->colmap_d = NULL; 1956 jaca = (Mat_SeqAIJ*)aij->A->data; 1957 if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 1958 if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 1959 aij->donotstash = PETSC_TRUE; 1960 aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 1961 jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly 1962 ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr); 1963 ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr); 1964 for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1; 1965 // allocate B copy data 1966 h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 1967 h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 1968 nnz = jacb->i[n]; 1969 if (jacb->compressedrow.use) { 1970 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 1971 aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 1972 Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 1973 h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 1974 } else { 1975 h_mat.offdiag.i = aijkokB->i_device_data(); 1976 } 1977 h_mat.offdiag.j = aijkokB->j_device_data(); 1978 h_mat.offdiag.a = aijkokB->a_device_data(); 1979 { 1980 Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N); 1981 aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 1982 Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 1983 h_mat.colmap = aijkokB->colmap_d->data(); 1984 ierr = PetscFree(colmap);CHKERRQ(ierr); 1985 } 1986 h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 1987 h_mat.offdiag.n = n; 1988 } 1989 // allocate A copy data 1990 nnz = jaca->i[n]; 1991 h_mat.diag.n = n; 1992 h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 1993 ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 1994 if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 1995 else { 1996 h_mat.diag.i = aijkokA->i_device_data(); 1997 } 1998 h_mat.diag.j = aijkokA->j_device_data(); 1999 h_mat.diag.a = aijkokA->a_device_data(); 2000 // copy pointers and metdata to device 2001 ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 2002 ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 2003 ierr = PetscInfo2(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 2004 } 2005 *B = d_mat; // return it, set it in Mat, and set it up 2006 A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 2007 PetscFunctionReturn(0); 2008 } 2009 2010 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask) 2011 { 2012 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 2013 2014 PetscFunctionBegin; 2015 if (!aijkok) *mask = "AIJKOK_UNALLOCATED"; 2016 else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU"; 2017 else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU"; 2018 else *mask = "PETSC_OFFLOAD_BOTH"; 2019 PetscFunctionReturn(0); 2020 } 2021 2022 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A) 2023 { 2024 PetscErrorCode ierr; 2025 PetscMPIInt size; 2026 Mat Ad,Ao; 2027 const char *amask,*bmask; 2028 2029 PetscFunctionBegin; 2030 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 2031 2032 if (size == 1) { 2033 ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr); 2034 ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr); 2035 } else { 2036 Ad = ((Mat_MPIAIJ*)A->data)->A; 2037 Ao = ((Mat_MPIAIJ*)A->data)->B; 2038 ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr); 2039 ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr); 2040 ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr); 2041 } 2042 PetscFunctionReturn(0); 2043 } 2044