xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 28b400f66ebc7ae0049166a2294dfcd3df27e64b)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscsf.h>
342550becSJunchao Zhang #include <petsc/private/sfimpl.h>
48c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h>
542550becSJunchao Zhang #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
6076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
711d22bbfSJunchao Zhang 
88c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
98c3ff71bSJunchao Zhang {
108c3ff71bSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
11a587d139SMark   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
128c3ff71bSJunchao Zhang 
138c3ff71bSJunchao Zhang   PetscFunctionBegin;
145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd_MPIAIJ(A,mode));
15a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
16a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
17a587d139SMark   }
18a587d139SMark 
198c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
208c3ff71bSJunchao Zhang }
218c3ff71bSJunchao Zhang 
228c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
238c3ff71bSJunchao Zhang {
248c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
258c3ff71bSJunchao Zhang 
268c3ff71bSJunchao Zhang   PetscFunctionBegin;
275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(mat->rmap));
285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(mat->cmap));
296a29ce69SStefano Zampini #if defined(PETSC_USE_DEBUG)
308c3ff71bSJunchao Zhang   if (d_nnz) {
316a29ce69SStefano Zampini     PetscInt i;
328c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
332c71b3e2SJacob Faibussowitsch       PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
348c3ff71bSJunchao Zhang     }
358c3ff71bSJunchao Zhang   }
368c3ff71bSJunchao Zhang   if (o_nnz) {
376a29ce69SStefano Zampini     PetscInt i;
388c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
392c71b3e2SJacob Faibussowitsch       PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
408c3ff71bSJunchao Zhang     }
418c3ff71bSJunchao Zhang   }
426a29ce69SStefano Zampini #endif
436a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableDestroy(&mpiaij->colmap));
456a29ce69SStefano Zampini #else
465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(mpiaij->colmap));
476a29ce69SStefano Zampini #endif
485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(mpiaij->garray));
495f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&mpiaij->lvec));
505f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterDestroy(&mpiaij->Mvctx));
516a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
525f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&mpiaij->B));
536a29ce69SStefano Zampini 
546a29ce69SStefano Zampini   if (!mpiaij->A) {
555f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreate(PETSC_COMM_SELF,&mpiaij->A));
565f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n));
575f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A));
586a29ce69SStefano Zampini   }
596a29ce69SStefano Zampini   if (!mpiaij->B) {
606a29ce69SStefano Zampini     PetscMPIInt size;
615f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
625f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreate(PETSC_COMM_SELF,&mpiaij->B));
635f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0));
645f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B));
658c3ff71bSJunchao Zhang   }
665f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(mpiaij->A,MATSEQAIJKOKKOS));
675f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(mpiaij->B,MATSEQAIJKOKKOS));
685f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz));
695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz));
708c3ff71bSJunchao Zhang   mat->preallocated = PETSC_TRUE;
718c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
728c3ff71bSJunchao Zhang }
738c3ff71bSJunchao Zhang 
748c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
758c3ff71bSJunchao Zhang {
768c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
778c3ff71bSJunchao Zhang   PetscInt       nt;
788c3ff71bSJunchao Zhang 
798c3ff71bSJunchao Zhang   PetscFunctionBegin;
805f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetLocalSize(xx,&nt));
812c71b3e2SJacob Faibussowitsch   PetscCheckFalse(nt != mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
825f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
835f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->A->ops->mult)(mpiaij->A,xx,yy));
845f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
855f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy));
868c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
878c3ff71bSJunchao Zhang }
888c3ff71bSJunchao Zhang 
898c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
908c3ff71bSJunchao Zhang {
918c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
928c3ff71bSJunchao Zhang   PetscInt       nt;
938c3ff71bSJunchao Zhang 
948c3ff71bSJunchao Zhang   PetscFunctionBegin;
955f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetLocalSize(xx,&nt));
962c71b3e2SJacob Faibussowitsch   PetscCheckFalse(nt != mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
975f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
985f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz));
995f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD));
1005f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz));
1018c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1028c3ff71bSJunchao Zhang }
1038c3ff71bSJunchao Zhang 
1048c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
1058c3ff71bSJunchao Zhang {
1068c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
1078c3ff71bSJunchao Zhang   PetscInt       nt;
1088c3ff71bSJunchao Zhang 
1098c3ff71bSJunchao Zhang   PetscFunctionBegin;
1105f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetLocalSize(xx,&nt));
1112c71b3e2SJacob Faibussowitsch   PetscCheckFalse(nt != mat->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt);
1125f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec));
1135f80ce2aSJacob Faibussowitsch   CHKERRQ((*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy));
1145f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
1155f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
1168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1178c3ff71bSJunchao Zhang }
1188c3ff71bSJunchao Zhang 
119076ba34aSJunchao Zhang /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
120076ba34aSJunchao Zhang    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
121076ba34aSJunchao Zhang    C still uses local column ids. Their corresponding global column ids are returned in glob.
122076ba34aSJunchao Zhang */
123076ba34aSJunchao Zhang PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
124076ba34aSJunchao Zhang {
125076ba34aSJunchao Zhang   Mat            Ad,Ao;
126076ba34aSJunchao Zhang   const PetscInt *cmap;
127076ba34aSJunchao Zhang 
128076ba34aSJunchao Zhang   PetscFunctionBegin;
1295f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap));
1305f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C));
131076ba34aSJunchao Zhang   if (glob) {
132076ba34aSJunchao Zhang     PetscInt cst, i, dn, on, *gidx;
1335f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetLocalSize(Ad,NULL,&dn));
1345f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetLocalSize(Ao,NULL,&on));
1355f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetOwnershipRangeColumn(mat,&cst,NULL));
1365f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(dn+on,&gidx));
137076ba34aSJunchao Zhang     for (i=0; i<dn; i++) gidx[i]    = cst + i;
138076ba34aSJunchao Zhang     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
1395f80ce2aSJacob Faibussowitsch     CHKERRQ(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
140076ba34aSJunchao Zhang   }
141076ba34aSJunchao Zhang   PetscFunctionReturn(0);
142076ba34aSJunchao Zhang }
143076ba34aSJunchao Zhang 
144076ba34aSJunchao Zhang /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
145076ba34aSJunchao Zhang struct MatMatStruct {
146076ba34aSJunchao Zhang   MatRowMapKokkosView   Cdstart; /* Used to split sequential matrix into petsc's A, B format */
147076ba34aSJunchao Zhang   PetscSF               sf; /* SF to send/recv matrix entries */
148076ba34aSJunchao Zhang   MatScalarKokkosView   abuf; /* buf of mat values in send/recv */
149076ba34aSJunchao Zhang   Mat                   C1,C2,B_local;
150076ba34aSJunchao Zhang   KokkosCsrMatrix       C1_global,C2_global,C_global;
151076ba34aSJunchao Zhang   KernelHandle          kh;
152076ba34aSJunchao Zhang   MatMatStruct() {
153076ba34aSJunchao Zhang     C1 = C2 = B_local = NULL;
154076ba34aSJunchao Zhang     sf = NULL;
155076ba34aSJunchao Zhang   }
156076ba34aSJunchao Zhang 
157076ba34aSJunchao Zhang   ~MatMatStruct() {
158076ba34aSJunchao Zhang     MatDestroy(&C1);
159076ba34aSJunchao Zhang     MatDestroy(&C2);
160076ba34aSJunchao Zhang     MatDestroy(&B_local);
161076ba34aSJunchao Zhang     PetscSFDestroy(&sf);
162076ba34aSJunchao Zhang     kh.destroy_spadd_handle();
163076ba34aSJunchao Zhang   }
164076ba34aSJunchao Zhang };
165076ba34aSJunchao Zhang 
166076ba34aSJunchao Zhang struct MatMatStruct_AB : public MatMatStruct {
167076ba34aSJunchao Zhang   MatColIdxKokkosView   rows;
168076ba34aSJunchao Zhang   MatRowMapKokkosView   rowoffset;
169076ba34aSJunchao Zhang   Mat                   B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
170076ba34aSJunchao Zhang 
171076ba34aSJunchao Zhang   MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
172076ba34aSJunchao Zhang   ~MatMatStruct_AB() {
173076ba34aSJunchao Zhang     MatDestroy(&B_other);
174076ba34aSJunchao Zhang     MatDestroy(&C_petsc);
175076ba34aSJunchao Zhang   }
176076ba34aSJunchao Zhang };
177076ba34aSJunchao Zhang 
178076ba34aSJunchao Zhang struct MatMatStruct_AtB : public MatMatStruct {
179076ba34aSJunchao Zhang   MatRowMapKokkosView   srcrowoffset,dstrowoffset;
180076ba34aSJunchao Zhang };
181076ba34aSJunchao Zhang 
182076ba34aSJunchao Zhang struct MatProductData_MPIAIJKokkos
183076ba34aSJunchao Zhang {
184076ba34aSJunchao Zhang   MatMatStruct_AB   *mmAB;
185076ba34aSJunchao Zhang   MatMatStruct_AtB  *mmAtB;
186076ba34aSJunchao Zhang   PetscBool         reusesym;
187076ba34aSJunchao Zhang 
188076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
189076ba34aSJunchao Zhang   ~MatProductData_MPIAIJKokkos() {
190076ba34aSJunchao Zhang     delete mmAB;
191076ba34aSJunchao Zhang     delete mmAtB;
192076ba34aSJunchao Zhang   }
193076ba34aSJunchao Zhang };
194076ba34aSJunchao Zhang 
195076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
196076ba34aSJunchao Zhang {
197076ba34aSJunchao Zhang   PetscFunctionBegin;
198076ba34aSJunchao Zhang   CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data));
199076ba34aSJunchao Zhang   PetscFunctionReturn(0);
200076ba34aSJunchao Zhang }
201076ba34aSJunchao Zhang 
202076ba34aSJunchao Zhang /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
203076ba34aSJunchao Zhang 
204076ba34aSJunchao Zhang    Input Parameters:
205076ba34aSJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
206076ba34aSJunchao Zhang .  N       - new column size for the returned Kokkos matrix
207076ba34aSJunchao Zhang -  l2g     - a map that maps old col ids to new col ids
208076ba34aSJunchao Zhang 
209076ba34aSJunchao Zhang    Output Parameters:
210076ba34aSJunchao Zhang .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
211076ba34aSJunchao Zhang  */
212076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
213076ba34aSJunchao Zhang {
214076ba34aSJunchao Zhang   KokkosCsrMatrix&         orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
215076ba34aSJunchao Zhang   MatColIdxKokkosView      jg("jg",orig.nnz()); /* New j array for csrmat */
216076ba34aSJunchao Zhang 
217076ba34aSJunchao Zhang   PetscFunctionBegin;
218076ba34aSJunchao Zhang   CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));}));
219076ba34aSJunchao Zhang   CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg));
220076ba34aSJunchao Zhang   PetscFunctionReturn(0);
221076ba34aSJunchao Zhang }
222076ba34aSJunchao Zhang 
223076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
224076ba34aSJunchao Zhang    It is similar to MatCreateMPIAIJWithSplitArrays.
225076ba34aSJunchao Zhang 
226076ba34aSJunchao Zhang   Input Parameters:
227076ba34aSJunchao Zhang +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
228076ba34aSJunchao Zhang .  A     - the diag matrix using local col ids
229076ba34aSJunchao Zhang -  B     - the offdiag matrix using global col ids
230076ba34aSJunchao Zhang 
231076ba34aSJunchao Zhang   Output Parameters:
232076ba34aSJunchao Zhang .  mat   - the updated MATMPIAIJKOKKOS matrix
233076ba34aSJunchao Zhang */
234076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
235076ba34aSJunchao Zhang {
236076ba34aSJunchao Zhang   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
237076ba34aSJunchao Zhang   PetscInt            m,n,M,N,Am,An,Bm,Bn;
238076ba34aSJunchao Zhang   Mat_SeqAIJKokkos    *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
239076ba34aSJunchao Zhang 
240076ba34aSJunchao Zhang   PetscFunctionBegin;
2415f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetSize(mat,&M,&N));
2425f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(mat,&m,&n));
2435f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(A,&Am,&An));
2445f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(B,&Bm,&Bn));
245076ba34aSJunchao Zhang 
2462c71b3e2SJacob Faibussowitsch   PetscCheckFalse(m != Am || m != Bm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match");
2472c71b3e2SJacob Faibussowitsch   PetscCheckFalse(n != An,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match");
2482c71b3e2SJacob Faibussowitsch   PetscCheckFalse(N != Bn,PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match");
2492c71b3e2SJacob Faibussowitsch   PetscCheckFalse(mpiaij->A || mpiaij->B,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty");
250076ba34aSJunchao Zhang   mpiaij->A = A;
251076ba34aSJunchao Zhang   mpiaij->B = B;
252076ba34aSJunchao Zhang 
253076ba34aSJunchao Zhang   mat->preallocated     = PETSC_TRUE;
254076ba34aSJunchao Zhang   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
255076ba34aSJunchao Zhang 
2565f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE));
2575f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY));
258076ba34aSJunchao Zhang   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
259076ba34aSJunchao Zhang     also gets mpiaij->B compacted, with its col ids and size reduced
260076ba34aSJunchao Zhang   */
2615f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY));
2625f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE));
2635f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE));
264076ba34aSJunchao Zhang 
265076ba34aSJunchao Zhang   /* Update bkok with new local col ids (stored on host) and size */
266076ba34aSJunchao Zhang   bkok->j_dual.modify_host();
267076ba34aSJunchao Zhang   bkok->j_dual.sync_device();
268076ba34aSJunchao Zhang   bkok->SetColSize(mpiaij->B->cmap->n);
269076ba34aSJunchao Zhang   PetscFunctionReturn(0);
270076ba34aSJunchao Zhang }
271076ba34aSJunchao Zhang 
272076ba34aSJunchao Zhang /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C).
273076ba34aSJunchao Zhang 
274076ba34aSJunchao Zhang    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
275076ba34aSJunchao Zhang    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
276076ba34aSJunchao Zhang    Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k
277076ba34aSJunchao Zhang    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
278076ba34aSJunchao Zhang 
279076ba34aSJunchao Zhang    Collective on comm of ownerSF
280076ba34aSJunchao Zhang 
281076ba34aSJunchao Zhang    Input Parameters:
282076ba34aSJunchao Zhang +   B       - the SEQAIJKOKKOS matrix, using local col ids
283076ba34aSJunchao Zhang .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
284076ba34aSJunchao Zhang .   N       - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
285076ba34aSJunchao Zhang .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
286076ba34aSJunchao Zhang .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
287076ba34aSJunchao Zhang 
288076ba34aSJunchao Zhang    Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
289076ba34aSJunchao Zhang +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
290076ba34aSJunchao Zhang .   abuf      - buffer for sending matrix values
291076ba34aSJunchao Zhang .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
292076ba34aSJunchao Zhang                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
293076ba34aSJunchao Zhang .   rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[]
294076ba34aSJunchao Zhang -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
295076ba34aSJunchao Zhang */
296076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
297076ba34aSJunchao Zhang                                            PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
298076ba34aSJunchao Zhang                                            MatRowMapKokkosView& rowoffset,Mat& C)
299076ba34aSJunchao Zhang {
300076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *bkok,*ckok;
301076ba34aSJunchao Zhang 
302076ba34aSJunchao Zhang   PetscFunctionBegin;
3035f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); /* Make sure B->spptr is accessible */
304076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
305076ba34aSJunchao Zhang 
306076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
307076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
308076ba34aSJunchao Zhang 
309076ba34aSJunchao Zhang     const auto& Ba = bkok->a_dual.view_device();
310076ba34aSJunchao Zhang     const auto& Bi = bkok->i_dual.view_device();
311076ba34aSJunchao Zhang     const auto& Ca = ckok->a_dual.view_device();
312076ba34aSJunchao Zhang 
313076ba34aSJunchao Zhang     /* Copy Ba to abuf */
314076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
315076ba34aSJunchao Zhang       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
316076ba34aSJunchao Zhang       PetscInt r    = rows(i);
317076ba34aSJunchao Zhang       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
318076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
319076ba34aSJunchao Zhang         abuf(base+k) = Ba(Bi(r)+k);
320076ba34aSJunchao Zhang       });
321076ba34aSJunchao Zhang     });
322076ba34aSJunchao Zhang 
323076ba34aSJunchao Zhang     /* Send abuf to Ca through bcastSF and then mark C is updated on device */
3245f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE)); /* TODO: get memtype for abuf */
3255f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE));
326076ba34aSJunchao Zhang     ckok->a_dual.modify_device();
327076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
328076ba34aSJunchao Zhang     MPI_Comm       comm;
329076ba34aSJunchao Zhang     PetscMPIInt    tag;
330076ba34aSJunchao Zhang     PetscInt       k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;
331076ba34aSJunchao Zhang 
3325f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscObjectGetComm((PetscObject)ownerSF,&comm));
3335f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL));
334076ba34aSJunchao Zhang     Cm   = nleaves; /* row size of C */
335076ba34aSJunchao Zhang     Cn   = N;  /* col size of C, which initially uses global ids, so we can safely set its col size as N */
336076ba34aSJunchao Zhang 
337076ba34aSJunchao Zhang     /* Get row lens (nz) of B's rows for later fast query */
338076ba34aSJunchao Zhang     PetscInt       *Browlens;
339076ba34aSJunchao Zhang     const PetscInt *tmp = bkok->i_host_data();
3405f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(nroots,&Browlens));
341076ba34aSJunchao Zhang     for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];
342076ba34aSJunchao Zhang 
343076ba34aSJunchao Zhang     /* By ownerSF, each proc gets lens of rows of C */
344076ba34aSJunchao Zhang     MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
345076ba34aSJunchao Zhang     Ci_h    = Ci.view_host().data();
346076ba34aSJunchao Zhang     Ci_h[0] = 0;
3475f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE));
3485f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE));
349076ba34aSJunchao Zhang     for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
350076ba34aSJunchao Zhang     Cnnz    = Ci_h[Cm];
351076ba34aSJunchao Zhang     Ci.modify_host();
352076ba34aSJunchao Zhang     Ci.sync_device();
353076ba34aSJunchao Zhang 
354076ba34aSJunchao Zhang     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
355076ba34aSJunchao Zhang     MatColIdxKokkosDualView  Cj("j",Cnnz);
356076ba34aSJunchao Zhang     MatScalarKokkosDualView  Ca("a",Cnnz);
357076ba34aSJunchao Zhang 
358076ba34aSJunchao Zhang     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
359076ba34aSJunchao Zhang     const PetscMPIInt *iranks,*ranks;
360076ba34aSJunchao Zhang     const PetscInt    *ioffset,*irootloc,*roffset;
361076ba34aSJunchao Zhang     PetscInt          i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
362076ba34aSJunchao Zhang     MPI_Request       *reqs;
363076ba34aSJunchao Zhang 
3645f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc)); /* irootloc[] contains indices of rows I need to send to each receiver */
3655f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* recv info */
366076ba34aSJunchao Zhang 
367076ba34aSJunchao Zhang     /* figure out offsets at the send buffer, to build the SF
368076ba34aSJunchao Zhang       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
369076ba34aSJunchao Zhang       rowptr[] - stores offsets for data of each row in abuf
370076ba34aSJunchao Zhang 
371076ba34aSJunchao Zhang       rdisp[]  - to receive sdisp[]
372076ba34aSJunchao Zhang     */
3735f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs));
374076ba34aSJunchao Zhang     MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
375076ba34aSJunchao Zhang     rowptr = rowptr_h.data();
376076ba34aSJunchao Zhang 
377076ba34aSJunchao Zhang     sdisp[0] = 0;
378076ba34aSJunchao Zhang     rowptr[0]  = 0;
379076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) { /* for each receiver */
380076ba34aSJunchao Zhang       PetscInt len, nz = 0;
381076ba34aSJunchao Zhang       for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
382076ba34aSJunchao Zhang         len         = Browlens[irootloc[j]];
383076ba34aSJunchao Zhang         rowptr[j+1] = rowptr[j] + len;
384076ba34aSJunchao Zhang         nz         += len;
385076ba34aSJunchao Zhang       }
386076ba34aSJunchao Zhang       sdisp[i+1] = sdisp[i] + nz;
387076ba34aSJunchao Zhang     }
3885f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscCommGetNewTag(comm,&tag));
3895f80ce2aSJacob Faibussowitsch     for (i=0; i<nranks; i++)  CHKERRMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]));
3905f80ce2aSJacob Faibussowitsch     for (i=0; i<niranks; i++) CHKERRMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]));
3915f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE));
392076ba34aSJunchao Zhang 
393076ba34aSJunchao Zhang     PetscInt    nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
394076ba34aSJunchao Zhang     PetscInt    nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
395076ba34aSJunchao Zhang     PetscSFNode *iremote;
3965f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(nleaves2,&iremote));
397076ba34aSJunchao Zhang     for (i=0; i<nranks; i++) { /* for each sender */
398076ba34aSJunchao Zhang       k = 0;
399076ba34aSJunchao Zhang       for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
400076ba34aSJunchao Zhang         iremote[j].rank  = ranks[i];
401076ba34aSJunchao Zhang         iremote[j].index = rdisp[i] + k;
402076ba34aSJunchao Zhang         k++;
403076ba34aSJunchao Zhang       }
404076ba34aSJunchao Zhang     }
405076ba34aSJunchao Zhang     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
4065f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFCreate(comm,&bcastSF));
4075f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
408076ba34aSJunchao Zhang 
409076ba34aSJunchao Zhang     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
410076ba34aSJunchao Zhang       from local to global. Then use bcastSF to fill Ca, Cj.
411076ba34aSJunchao Zhang     */
412076ba34aSJunchao Zhang     ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
413076ba34aSJunchao Zhang     MatColIdxKokkosView          rows("rows",ioffset[niranks]);
414076ba34aSJunchao Zhang     Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */
415076ba34aSJunchao Zhang 
416076ba34aSJunchao Zhang     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
417076ba34aSJunchao Zhang 
418076ba34aSJunchao Zhang     MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
419076ba34aSJunchao Zhang     abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */
420076ba34aSJunchao Zhang 
421076ba34aSJunchao Zhang     const auto& Ba = bkok->a_dual.view_device();
422076ba34aSJunchao Zhang     const auto& Bi = bkok->i_dual.view_device();
423076ba34aSJunchao Zhang     const auto& Bj = bkok->j_dual.view_device();
424076ba34aSJunchao Zhang 
425076ba34aSJunchao Zhang     /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
426076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
427076ba34aSJunchao Zhang       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
428076ba34aSJunchao Zhang       PetscInt r    = rows(i);
429076ba34aSJunchao Zhang       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
430076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
431076ba34aSJunchao Zhang         abuf(base+k) = Ba(Bi(r)+k);
432076ba34aSJunchao Zhang         jbuf(base+k) = l2g(Bj(Bi(r)+k));
433076ba34aSJunchao Zhang       });
434076ba34aSJunchao Zhang     });
435076ba34aSJunchao Zhang 
436076ba34aSJunchao Zhang     /* Send abuf & jbuf to fill Ca, Cj */
4375f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastBegin(bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE));
4385f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE));
4395f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastEnd  (bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE));
4405f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE));
441076ba34aSJunchao Zhang     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
442076ba34aSJunchao Zhang     Cj.sync_host();
443076ba34aSJunchao Zhang     Ca.modify_device();
444076ba34aSJunchao Zhang 
445076ba34aSJunchao Zhang     /* Construct C with Ca, Ci, Cj */
446076ba34aSJunchao Zhang     auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
4475f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C));
4485f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree3(sdisp,rdisp,reqs));
4495f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(Browlens));
45098921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
451076ba34aSJunchao Zhang   PetscFunctionReturn(0);
452076ba34aSJunchao Zhang }
453076ba34aSJunchao Zhang 
454076ba34aSJunchao Zhang /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
455076ba34aSJunchao Zhang 
456076ba34aSJunchao Zhang   It is the reverse of MatSeqAIJKokkosBcast in some sense.
457076ba34aSJunchao Zhang 
458076ba34aSJunchao Zhang   Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves.
459076ba34aSJunchao Zhang   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
460076ba34aSJunchao Zhang   contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
461076ba34aSJunchao Zhang 
462076ba34aSJunchao Zhang   Input Parameters:
463076ba34aSJunchao Zhang +  A        - the SEQAIJKOKKOS matrix to be reduced
464076ba34aSJunchao Zhang .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
465076ba34aSJunchao Zhang .  local    - true if A uses local col ids; false if A is already in global col ids.
466076ba34aSJunchao Zhang .  N        - if local, N is A's global col size
467076ba34aSJunchao Zhang .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
468076ba34aSJunchao Zhang -  ownerSF  - the SF specifies ownership (root) of rows in A
469076ba34aSJunchao Zhang 
470076ba34aSJunchao Zhang   Output Parameters:
471076ba34aSJunchao Zhang +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
472076ba34aSJunchao Zhang .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
473076ba34aSJunchao Zhang .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
474076ba34aSJunchao Zhang .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to
475076ba34aSJunchao Zhang                   unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset.
476076ba34aSJunchao Zhang -  C            - the matrix made up by rows sent to me from other ranks, using global col ids
477076ba34aSJunchao Zhang 
478076ba34aSJunchao Zhang    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
479076ba34aSJunchao Zhang  */
480076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
481076ba34aSJunchao Zhang                                             PetscSF& reduceSF,MatScalarKokkosView& abuf,
482076ba34aSJunchao Zhang                                             MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
483076ba34aSJunchao Zhang                                             KokkosCsrMatrix& C)
484076ba34aSJunchao Zhang {
485076ba34aSJunchao Zhang   PetscInt               i,r,Am,An,Annz,Cnnz,nrows;
486076ba34aSJunchao Zhang   const PetscInt         *Ai;
487076ba34aSJunchao Zhang   Mat_SeqAIJKokkos       *akok;
488076ba34aSJunchao Zhang 
489076ba34aSJunchao Zhang   PetscFunctionBegin;
4905f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); /* So that A's latest data is on device */
4915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetSize(A,&Am,&An));
492076ba34aSJunchao Zhang   Ai   = static_cast<Mat_SeqAIJ*>(A->data)->i;
493076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
494076ba34aSJunchao Zhang   Annz = Ai[Am];
495076ba34aSJunchao Zhang 
496076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
497076ba34aSJunchao Zhang     /* Send Aa to abuf */
4985f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE));
4995f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE));
500076ba34aSJunchao Zhang 
501076ba34aSJunchao Zhang     /* Copy abuf to Ca */
502076ba34aSJunchao Zhang     const MatScalarKokkosView& Ca = C.values;
503076ba34aSJunchao Zhang     nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
504076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
505076ba34aSJunchao Zhang       PetscInt i   = t.league_rank();
506076ba34aSJunchao Zhang       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
507076ba34aSJunchao Zhang       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
508076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
509076ba34aSJunchao Zhang     });
510076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
511076ba34aSJunchao Zhang     MPI_Comm               comm;
512076ba34aSJunchao Zhang     MPI_Request            *reqs;
513076ba34aSJunchao Zhang     PetscMPIInt            tag;
514076ba34aSJunchao Zhang     PetscInt               Cm;
515076ba34aSJunchao Zhang 
5165f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectGetComm((PetscObject)ownerSF,&comm));
5175f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscCommGetNewTag(comm,&tag));
518076ba34aSJunchao Zhang 
519076ba34aSJunchao Zhang     PetscInt niranks,nranks,nroots,nleaves;
520076ba34aSJunchao Zhang     const PetscMPIInt *iranks,*ranks;
521076ba34aSJunchao Zhang     const PetscInt *ioffset,*rows,*roffset;  /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
5225f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFSetUp(ownerSF));
5235f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows)); /* recv info: iranks[] will send rows to me */
5245f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* send info */
5255f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL));
5262c71b3e2SJacob Faibussowitsch     PetscCheckFalse(nleaves != Am,PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am);
527076ba34aSJunchao Zhang     Cm    = nroots;
528076ba34aSJunchao Zhang     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
529076ba34aSJunchao Zhang 
530076ba34aSJunchao Zhang     /* Tell owners how long each row I will send */
531076ba34aSJunchao Zhang     PetscInt                *srowlens; /* send buf of row lens */
532076ba34aSJunchao Zhang     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
533076ba34aSJunchao Zhang     PetscInt                *rrowlens = rrowlens_h.data();
534076ba34aSJunchao Zhang 
5355f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs));
536076ba34aSJunchao Zhang     for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
537076ba34aSJunchao Zhang     rrowlens[0] = 0;
538076ba34aSJunchao Zhang     rrowlens++; /* shift the pointer to make the following expression more readable */
5395f80ce2aSJacob Faibussowitsch     for (i=0; i<niranks; i++)CHKERRMPI(MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]));
5405f80ce2aSJacob Faibussowitsch     for (i=0; i<nranks; i++) CHKERRMPI(MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]));
5415f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE));
542076ba34aSJunchao Zhang 
543076ba34aSJunchao Zhang     /* Owner builds Ci on host by histogramming rrowlens[] */
544076ba34aSJunchao Zhang     MatRowMapKokkosViewHost Ci_h("i",Cm+1);
545076ba34aSJunchao Zhang     Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
546076ba34aSJunchao Zhang     MatRowMapType *Ci_ptr = Ci_h.data();
547076ba34aSJunchao Zhang 
548076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) {
549076ba34aSJunchao Zhang       r = rows[i]; /* local row id of i-th received row */
550076ba34aSJunchao Zhang      #if defined(PETSC_USE_DEBUG)
5512c71b3e2SJacob Faibussowitsch       PetscCheckFalse(r<0 || r>=Cm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm);
552076ba34aSJunchao Zhang      #endif
553076ba34aSJunchao Zhang       Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
554076ba34aSJunchao Zhang     }
555076ba34aSJunchao Zhang     for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
556076ba34aSJunchao Zhang     Cnnz = Ci_ptr[Cm];
557076ba34aSJunchao Zhang 
558076ba34aSJunchao Zhang     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
559076ba34aSJunchao Zhang     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
560076ba34aSJunchao Zhang     PetscInt                *dstrowoffset_hptr = dstrowoffset_h.data();
561076ba34aSJunchao Zhang     PetscInt                *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
562076ba34aSJunchao Zhang 
5635f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscCalloc1(Cm,&currowlens)); /* Init with zero, to be added to */
564076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) { /* for each row I receive */
565076ba34aSJunchao Zhang       r                    = rows[i]; /* row id in C */
566076ba34aSJunchao Zhang       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
567076ba34aSJunchao Zhang       currowlens[r]       += rrowlens[i]; /* accumulate to length of row r in C */
568076ba34aSJunchao Zhang     }
5695f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(currowlens));
570076ba34aSJunchao Zhang 
571076ba34aSJunchao Zhang     rrowlens--;
572076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
573076ba34aSJunchao Zhang     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
574076ba34aSJunchao Zhang     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
575076ba34aSJunchao Zhang 
576076ba34aSJunchao Zhang     /* Build the reduceSF, which performs buffer to buffer send/recv */
577076ba34aSJunchao Zhang     PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
5785f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc2(niranks,&sdisp,nranks,&rdisp));
579076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
5805f80ce2aSJacob Faibussowitsch     for (i=0; i<nranks; i++)  CHKERRMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]));
5815f80ce2aSJacob Faibussowitsch     for (i=0; i<niranks; i++) CHKERRMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]));
5825f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE));
583076ba34aSJunchao Zhang 
584076ba34aSJunchao Zhang     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
585076ba34aSJunchao Zhang     PetscInt    nroots2 = Cnnz,nleaves2 = Annz;
586076ba34aSJunchao Zhang     PetscSFNode *iremote;
5875f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(nleaves2,&iremote)); /* no free, since memory will be given to reduceSF */
588076ba34aSJunchao Zhang     for (i=0; i<nranks; i++) {
589076ba34aSJunchao Zhang       PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
590076ba34aSJunchao Zhang       PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
591076ba34aSJunchao Zhang       PetscInt nz       = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
592076ba34aSJunchao Zhang       for (PetscInt k=0; k<nz; k++) {
593076ba34aSJunchao Zhang         iremote[leafbase+k].rank  = ranks[i];
594076ba34aSJunchao Zhang         iremote[leafbase+k].index = rootbase + k;
595076ba34aSJunchao Zhang       }
596076ba34aSJunchao Zhang     }
5975f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFCreate(comm,&reduceSF));
5985f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
5995f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree2(sdisp,rdisp));
600076ba34aSJunchao Zhang 
601076ba34aSJunchao Zhang     /* Reduce Aa, Ajg to abuf and jbuf */
602076ba34aSJunchao Zhang 
603076ba34aSJunchao Zhang     /* If A uses local col ids, convert them to global ones before sending */
604076ba34aSJunchao Zhang     MatColIdxKokkosView Ajg;
605076ba34aSJunchao Zhang     if (local) {
606076ba34aSJunchao Zhang       Ajg = MatColIdxKokkosView("j",Annz);
607076ba34aSJunchao Zhang       const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
608076ba34aSJunchao Zhang       Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
609076ba34aSJunchao Zhang     } else {
610076ba34aSJunchao Zhang       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
611076ba34aSJunchao Zhang     }
612076ba34aSJunchao Zhang 
613076ba34aSJunchao Zhang     MatColIdxKokkosView   jbuf("jbuf",Cnnz);
614076ba34aSJunchao Zhang     abuf = MatScalarKokkosView("abuf",Cnnz);
6155f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceBegin(reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE));
6165f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceEnd  (reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE));
6175f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE));
6185f80ce2aSJacob Faibussowitsch     CHKERRMPI(PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE));
619076ba34aSJunchao Zhang 
620076ba34aSJunchao Zhang     /* Copy data from abuf, jbuf to Ca, Cj */
621076ba34aSJunchao Zhang     MatRowMapKokkosView    Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
622076ba34aSJunchao Zhang     MatColIdxKokkosView    Cj("j",Cnnz);
623076ba34aSJunchao Zhang     MatScalarKokkosView    Ca("a",Cnnz);
624076ba34aSJunchao Zhang 
625076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
626076ba34aSJunchao Zhang       PetscInt i   = t.league_rank();
627076ba34aSJunchao Zhang       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
628076ba34aSJunchao Zhang       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
629076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
630076ba34aSJunchao Zhang         Ca(dst+k) = abuf(src+k);
631076ba34aSJunchao Zhang         Cj(dst+k) = jbuf(src+k);
632076ba34aSJunchao Zhang       });
633076ba34aSJunchao Zhang     });
634076ba34aSJunchao Zhang 
635076ba34aSJunchao Zhang     /* Build C with Ca, Ci, Cj */
636076ba34aSJunchao Zhang     C    = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
6375f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree2(srowlens,reqs));
63898921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
639076ba34aSJunchao Zhang   PetscFunctionReturn(0);
640076ba34aSJunchao Zhang }
641076ba34aSJunchao Zhang 
642076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix
643076ba34aSJunchao Zhang 
644076ba34aSJunchao Zhang   Input Parameters:
645076ba34aSJunchao Zhang +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
646076ba34aSJunchao Zhang .  reuse    - indicate whether the matrix has called this function before
647076ba34aSJunchao Zhang .  csrmat   - the KokkosCsrMatrix, of size m,N
648076ba34aSJunchao Zhang -  Cdstart  - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first
649076ba34aSJunchao Zhang               entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
650076ba34aSJunchao Zhang               entry is 5, then Cdstart[i] = 3.
651076ba34aSJunchao Zhang 
652076ba34aSJunchao Zhang   Output Parameters:
653076ba34aSJunchao Zhang +  C        - the updated MATMPIAIJKOKKOS matrix
654076ba34aSJunchao Zhang -  Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
655076ba34aSJunchao Zhang 
656076ba34aSJunchao Zhang   Notes:
657076ba34aSJunchao Zhang    Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
658076ba34aSJunchao Zhang  */
659076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
660076ba34aSJunchao Zhang {
661076ba34aSJunchao Zhang   const MatScalarKokkosView&      Ca = csrmat.values;
662076ba34aSJunchao Zhang   const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
663076ba34aSJunchao Zhang   PetscInt                        m,n,N;
664076ba34aSJunchao Zhang 
665076ba34aSJunchao Zhang   PetscFunctionBegin;
6665f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetLocalSize(C,&m,&n));
6675f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetSize(C,NULL,&N));
668076ba34aSJunchao Zhang 
669076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
670076ba34aSJunchao Zhang     Mat_MPIAIJ                  *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
671076ba34aSJunchao Zhang     Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
672076ba34aSJunchao Zhang     Mat_SeqAIJKokkos            *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
673076ba34aSJunchao Zhang     const MatScalarKokkosView&  Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
674076ba34aSJunchao Zhang     const MatRowMapKokkosView&  Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();
675076ba34aSJunchao Zhang 
676076ba34aSJunchao Zhang     /* Fill 'a' of Cd and Co on device */
677076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
678076ba34aSJunchao Zhang       PetscInt i       = t.league_rank(); /* row i */
679076ba34aSJunchao Zhang       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
680076ba34aSJunchao Zhang       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
681076ba34aSJunchao Zhang       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
682076ba34aSJunchao Zhang       PetscInt cdend   = cdstart + cdlen;
683076ba34aSJunchao Zhang       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
684076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
685076ba34aSJunchao Zhang         if (k < cdstart) {  /* k in [0, cdstart) */
686076ba34aSJunchao Zhang           Coa(Coi(i)+k) = Ca(Ci(i)+k);
687076ba34aSJunchao Zhang         } else if (k < cdend) { /* k in [cdstart, cdend) */
688076ba34aSJunchao Zhang           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
689076ba34aSJunchao Zhang         } else { /* k in [cdend, clen) */
690076ba34aSJunchao Zhang           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
691076ba34aSJunchao Zhang         }
692076ba34aSJunchao Zhang       });
693076ba34aSJunchao Zhang     });
694076ba34aSJunchao Zhang 
695076ba34aSJunchao Zhang     akok->a_dual.modify_device();
696076ba34aSJunchao Zhang     bkok->a_dual.modify_device();
697076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
698076ba34aSJunchao Zhang     Mat                         Cd,Co;
699076ba34aSJunchao Zhang     const MatColIdxKokkosView&  Cj = csrmat.graph.entries;
700076ba34aSJunchao Zhang     MatRowMapKokkosDualView     Cdi_dual("i",m+1),Coi_dual("i",m+1);
701076ba34aSJunchao Zhang     MatRowMapKokkosView         Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
702076ba34aSJunchao Zhang     PetscInt                    cstart,cend;
703076ba34aSJunchao Zhang 
704076ba34aSJunchao Zhang     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
705076ba34aSJunchao Zhang        left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend).
706076ba34aSJunchao Zhang        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
707076ba34aSJunchao Zhang        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly
708076ba34aSJunchao Zhang        stores values of cdstart and cdend-cstart (aka Cdi[]) instead.
709076ba34aSJunchao Zhang      */
710076ba34aSJunchao Zhang     Cdstart = MatRowMapKokkosView("Cdstart",m);
7115f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutGetRange(C->cmap,&cstart,&cend)); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
712076ba34aSJunchao Zhang 
713076ba34aSJunchao Zhang     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
714076ba34aSJunchao Zhang       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
715076ba34aSJunchao Zhang      */
716076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
717076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
718076ba34aSJunchao Zhang         PetscInt i = t.league_rank(); /* row i */
719076ba34aSJunchao Zhang         PetscInt j,first,count,step;
720076ba34aSJunchao Zhang 
721076ba34aSJunchao Zhang         if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
722076ba34aSJunchao Zhang           Cdi(0) = 0;
723076ba34aSJunchao Zhang           Coi(0) = 0;
724076ba34aSJunchao Zhang         }
725076ba34aSJunchao Zhang 
726076ba34aSJunchao Zhang         /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
727076ba34aSJunchao Zhang           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
728076ba34aSJunchao Zhang         */
729076ba34aSJunchao Zhang         count = Ci(i+1)-Ci(i);
730076ba34aSJunchao Zhang         first = Ci(i);
731076ba34aSJunchao Zhang         while (count > 0) {
732076ba34aSJunchao Zhang           j    = first;
733076ba34aSJunchao Zhang           step = count / 2;
734076ba34aSJunchao Zhang           j   += step;
735076ba34aSJunchao Zhang           if (Cj(j) < cstart) {
736076ba34aSJunchao Zhang             first  = ++j;
737076ba34aSJunchao Zhang             count -= step + 1;
738076ba34aSJunchao Zhang           } else count = step;
739076ba34aSJunchao Zhang         }
740076ba34aSJunchao Zhang         Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
741076ba34aSJunchao Zhang 
742076ba34aSJunchao Zhang         /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
743076ba34aSJunchao Zhang         count = Ci(i+1) - first;
744076ba34aSJunchao Zhang         while (count > 0) {
745076ba34aSJunchao Zhang           j    = first;
746076ba34aSJunchao Zhang           step = count / 2;
747076ba34aSJunchao Zhang           j   += step;
748076ba34aSJunchao Zhang           if (Cj(j) < cend) {
749076ba34aSJunchao Zhang             first  = ++j;
750076ba34aSJunchao Zhang             count -= step + 1;
751076ba34aSJunchao Zhang           } else count = step;
752076ba34aSJunchao Zhang         }
753076ba34aSJunchao Zhang         Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
754076ba34aSJunchao Zhang         Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
755076ba34aSJunchao Zhang       });
756076ba34aSJunchao Zhang     });
757076ba34aSJunchao Zhang 
758076ba34aSJunchao Zhang     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
759076ba34aSJunchao Zhang     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
760076ba34aSJunchao Zhang       update += Cdi(i);
761076ba34aSJunchao Zhang       if (final) Cdi(i) = update;
762076ba34aSJunchao Zhang     });
763076ba34aSJunchao Zhang     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
764076ba34aSJunchao Zhang       update += Coi(i);
765076ba34aSJunchao Zhang       if (final) Coi(i) = update;
766076ba34aSJunchao Zhang     });
767076ba34aSJunchao Zhang 
768076ba34aSJunchao Zhang     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
769076ba34aSJunchao Zhang        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
770076ba34aSJunchao Zhang     */
771076ba34aSJunchao Zhang     Cdi_dual.modify_device();
772076ba34aSJunchao Zhang     Coi_dual.modify_device();
773076ba34aSJunchao Zhang     Cdi_dual.sync_host();
774076ba34aSJunchao Zhang     Coi_dual.sync_host();
775076ba34aSJunchao Zhang     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
776076ba34aSJunchao Zhang     PetscInt Co_nnz = Coi_dual.view_host().data()[m];
777076ba34aSJunchao Zhang 
778076ba34aSJunchao Zhang     /* With nnz, allocate a, j for Cd and Co */
779076ba34aSJunchao Zhang     MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
780076ba34aSJunchao Zhang     MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);
781076ba34aSJunchao Zhang 
782076ba34aSJunchao Zhang     /* Fill a, j of Cd and Co on device */
783076ba34aSJunchao Zhang     MatColIdxKokkosView     Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
784076ba34aSJunchao Zhang     MatScalarKokkosView     Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();
785076ba34aSJunchao Zhang 
786076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
787076ba34aSJunchao Zhang       PetscInt i       = t.league_rank(); /* row i */
788076ba34aSJunchao Zhang       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
789076ba34aSJunchao Zhang       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
790076ba34aSJunchao Zhang       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
791076ba34aSJunchao Zhang       PetscInt cdend   = cdstart + cdlen;
792076ba34aSJunchao Zhang       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
793076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
794076ba34aSJunchao Zhang         if (k < cdstart) { /* k in [0, cdstart) */
795076ba34aSJunchao Zhang           Coa(Coi(i)+k) = Ca(Ci(i)+k);
796076ba34aSJunchao Zhang           Coj(Coi(i)+k) = Cj(Ci(i)+k);
797076ba34aSJunchao Zhang         } else if (k < cdend) { /* k in [cdstart, cdend) */
798076ba34aSJunchao Zhang           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
799076ba34aSJunchao Zhang           Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
800076ba34aSJunchao Zhang         } else { /* k in [cdend, clen) */
801076ba34aSJunchao Zhang           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
802076ba34aSJunchao Zhang           Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
803076ba34aSJunchao Zhang         }
804076ba34aSJunchao Zhang       });
805076ba34aSJunchao Zhang     });
806076ba34aSJunchao Zhang 
807076ba34aSJunchao Zhang     Cdj_dual.modify_device();
808076ba34aSJunchao Zhang     Cda_dual.modify_device();
809076ba34aSJunchao Zhang     Coj_dual.modify_device();
810076ba34aSJunchao Zhang     Coa_dual.modify_device();
811076ba34aSJunchao Zhang     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
812076ba34aSJunchao Zhang     auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
813076ba34aSJunchao Zhang     auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
8145f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd));
8155f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co));
8165f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co)); /* Coj will be converted to local ids within */
817076ba34aSJunchao Zhang   }
818076ba34aSJunchao Zhang   PetscFunctionReturn(0);
819076ba34aSJunchao Zhang }
820076ba34aSJunchao Zhang 
821076ba34aSJunchao Zhang /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids.
822076ba34aSJunchao Zhang 
823076ba34aSJunchao Zhang   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
824076ba34aSJunchao Zhang 
825076ba34aSJunchao Zhang   Input Parameters:
826076ba34aSJunchao Zhang +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
827076ba34aSJunchao Zhang .  reuse    - indicate whether the matrix has called this function before
828076ba34aSJunchao Zhang .  csrmat   - the KokkosCsrMatrix, of size m,N
829076ba34aSJunchao Zhang -  Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first
830076ba34aSJunchao Zhang               entry of the diag block of C in csrmat's j array.
831076ba34aSJunchao Zhang 
832076ba34aSJunchao Zhang   Output Parameters:
833076ba34aSJunchao Zhang +  C        - the updated MATMPIAIJKOKKOS matrix
834076ba34aSJunchao Zhang -  Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
835076ba34aSJunchao Zhang 
836076ba34aSJunchao Zhang   Notes: the input matrix's col ids and col size will be changed.
837076ba34aSJunchao Zhang */
838076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
839076ba34aSJunchao Zhang {
840076ba34aSJunchao Zhang   Mat_SeqAIJKokkos       *ckok;
841076ba34aSJunchao Zhang   ISLocalToGlobalMapping l2gmap;
842076ba34aSJunchao Zhang   const PetscInt         *garray;
843076ba34aSJunchao Zhang   PetscInt               sz;
844076ba34aSJunchao Zhang 
845076ba34aSJunchao Zhang   PetscFunctionBegin;
846076ba34aSJunchao Zhang   /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */
8475f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap));
848076ba34aSJunchao Zhang   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
849076ba34aSJunchao Zhang   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
850076ba34aSJunchao Zhang   ckok->j_dual.sync_device();
851076ba34aSJunchao Zhang   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
852076ba34aSJunchao Zhang 
853076ba34aSJunchao Zhang   /* Build l2g -- the local to global mapping of C's cols */
8545f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingGetIndices(l2gmap,&garray));
8555f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingGetSize(l2gmap,&sz));
8562c71b3e2SJacob Faibussowitsch   PetscCheckFalse(C->cmap->n != sz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz);
857076ba34aSJunchao Zhang 
858076ba34aSJunchao Zhang   ConstMatColIdxKokkosViewHost tmp(garray,sz);
859076ba34aSJunchao Zhang   l2g = MatColIdxKokkosView("l2g",sz);
860076ba34aSJunchao Zhang   Kokkos::deep_copy(l2g,tmp);
861076ba34aSJunchao Zhang 
8625f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray));
8635f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingDestroy(&l2gmap));
864076ba34aSJunchao Zhang   PetscFunctionReturn(0);
865076ba34aSJunchao Zhang }
866076ba34aSJunchao Zhang 
867076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
868076ba34aSJunchao Zhang 
869076ba34aSJunchao Zhang   Input Parameters:
870076ba34aSJunchao Zhang +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
871076ba34aSJunchao Zhang .  A        - an MPIAIJKOKKOS matrix
872076ba34aSJunchao Zhang .  B        - an MPIAIJKOKKOS matrix
873076ba34aSJunchao Zhang -  mm       - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations.
874076ba34aSJunchao Zhang 
875076ba34aSJunchao Zhang   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
876076ba34aSJunchao Zhang */
877076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
878076ba34aSJunchao Zhang {
879076ba34aSJunchao Zhang   Mat_MPIAIJ                  *a = static_cast<Mat_MPIAIJ*>(A->data);
880076ba34aSJunchao Zhang   Mat                         Ad = a->A,Ao = a->B; /* diag and offdiag of A */
881076ba34aSJunchao Zhang   IS                          glob = NULL;
882076ba34aSJunchao Zhang   const PetscInt              *garray;
883076ba34aSJunchao Zhang   PetscInt                    N = B->cmap->N,sz;
884076ba34aSJunchao Zhang   ConstMatColIdxKokkosView    l2g1; /* two temp maps mapping local col ids to global ones */
885076ba34aSJunchao Zhang   MatColIdxKokkosView         l2g2;
886076ba34aSJunchao Zhang   Mat                         C1,C2; /* intermediate matrices */
887076ba34aSJunchao Zhang 
888076ba34aSJunchao Zhang   PetscFunctionBegin;
889076ba34aSJunchao Zhang   /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
8905f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local));
8915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(Ad,mm->B_local,NULL,&C1));
8925f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(C1,MATPRODUCT_AB));
8935f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(C1,product->fill));
894076ba34aSJunchao Zhang   C1->product->api_user = product->api_user;
8955f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(C1));
8965f80ce2aSJacob Faibussowitsch   PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
8975f80ce2aSJacob Faibussowitsch   CHKERRQ((*C1->ops->productsymbolic)(C1));
898076ba34aSJunchao Zhang 
8995f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(glob,&garray));
9005f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetSize(glob,&sz));
901076ba34aSJunchao Zhang   const auto& tmp  = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
902076ba34aSJunchao Zhang   l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
9035f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global));
904076ba34aSJunchao Zhang 
905076ba34aSJunchao Zhang   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
9065f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,mm->abuf,mm->rows,mm->rowoffset,mm->B_other));
907076ba34aSJunchao Zhang 
908076ba34aSJunchao Zhang   /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */
9095f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2));
9105f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(Ao,mm->B_other,NULL,&C2));
9115f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(C2,MATPRODUCT_AB));
9125f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(C2,product->fill));
913076ba34aSJunchao Zhang   C2->product->api_user = product->api_user;
9145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(C2));
9155f80ce2aSJacob Faibussowitsch   PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
9165f80ce2aSJacob Faibussowitsch   CHKERRQ((*C2->ops->productsymbolic)(C2));
9175f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global));
918076ba34aSJunchao Zhang 
919076ba34aSJunchao Zhang   /* C = C1 + C2.  We actually use their global col ids versions in adding */
920076ba34aSJunchao Zhang   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
921076ba34aSJunchao Zhang   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
922076ba34aSJunchao Zhang   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
923076ba34aSJunchao Zhang   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
924076ba34aSJunchao Zhang 
925076ba34aSJunchao Zhang   mm->C1 = C1;
926076ba34aSJunchao Zhang   mm->C2 = C2;
9275f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(glob,&garray));
9285f80ce2aSJacob Faibussowitsch   CHKERRQ(ISDestroy(&glob));
929076ba34aSJunchao Zhang   PetscFunctionReturn(0);
930076ba34aSJunchao Zhang }
931076ba34aSJunchao Zhang 
932076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
933076ba34aSJunchao Zhang 
934076ba34aSJunchao Zhang   Input Parameters:
935076ba34aSJunchao Zhang +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
936076ba34aSJunchao Zhang .  A        - an MPIAIJKOKKOS matrix
937076ba34aSJunchao Zhang .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
938076ba34aSJunchao Zhang .  localB   - Does B use local col ids? If false, then B is already in global col ids.
939076ba34aSJunchao Zhang .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
940076ba34aSJunchao Zhang .  l2g      - If localB, then l2g maps B's local col ids to global ones.
941076ba34aSJunchao Zhang -  mm       - a struct used to stash intermediate data in AtB
942076ba34aSJunchao Zhang 
943076ba34aSJunchao Zhang   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
944076ba34aSJunchao Zhang */
945076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
946076ba34aSJunchao Zhang {
947076ba34aSJunchao Zhang   Mat_MPIAIJ             *a = static_cast<Mat_MPIAIJ*>(A->data);
948076ba34aSJunchao Zhang   Mat                    Ad = a->A,Ao = a->B; /* diag and offdiag of A */
949076ba34aSJunchao Zhang   Mat                    C1,C2; /* intermediate matrices */
950076ba34aSJunchao Zhang 
951076ba34aSJunchao Zhang   PetscFunctionBegin;
952076ba34aSJunchao Zhang   /* C1 = Ad^t * B */
9535f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(Ad,B,NULL,&C1));
9545f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(C1,MATPRODUCT_AtB));
9555f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(C1,product->fill));
956076ba34aSJunchao Zhang   C1->product->api_user = product->api_user;
9575f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(C1));
9585f80ce2aSJacob Faibussowitsch   PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
9595f80ce2aSJacob Faibussowitsch   CHKERRQ((*C1->ops->productsymbolic)(C1));
960076ba34aSJunchao Zhang 
9615f80ce2aSJacob Faibussowitsch   if (localB) CHKERRQ(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global));
962076ba34aSJunchao Zhang   else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
963076ba34aSJunchao Zhang 
964076ba34aSJunchao Zhang   /* C2 = Ao^t * B */
9655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductCreate(Ao,B,NULL,&C2));
9665f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetType(C2,MATPRODUCT_AtB));
9675f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFill(C2,product->fill));
968076ba34aSJunchao Zhang   C2->product->api_user = product->api_user;
9695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatProductSetFromOptions(C2));
9705f80ce2aSJacob Faibussowitsch   PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
9715f80ce2aSJacob Faibussowitsch   CHKERRQ((*C2->ops->productsymbolic)(C2));
972076ba34aSJunchao Zhang 
9735f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,mm->srcrowoffset,mm->dstrowoffset,mm->C2_global));
974076ba34aSJunchao Zhang 
975076ba34aSJunchao Zhang   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
976076ba34aSJunchao Zhang   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
977076ba34aSJunchao Zhang   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
978076ba34aSJunchao Zhang   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
979076ba34aSJunchao Zhang   mm->C1 = C1;
980076ba34aSJunchao Zhang   mm->C2 = C2;
981076ba34aSJunchao Zhang   PetscFunctionReturn(0);
982076ba34aSJunchao Zhang }
983076ba34aSJunchao Zhang 
984076ba34aSJunchao Zhang PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
985076ba34aSJunchao Zhang {
986076ba34aSJunchao Zhang   Mat_Product                   *product = C->product;
987076ba34aSJunchao Zhang   MatProductType                ptype;
988076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos   *mmdata;
989076ba34aSJunchao Zhang   MatMatStruct                  *mm = NULL;
990076ba34aSJunchao Zhang   MatMatStruct_AB               *ab;
991076ba34aSJunchao Zhang   MatMatStruct_AtB              *atb;
992076ba34aSJunchao Zhang   Mat                           A,B,Ad,Ao,Bd,Bo;
993076ba34aSJunchao Zhang   const MatScalarType           one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */
994076ba34aSJunchao Zhang 
995076ba34aSJunchao Zhang   PetscFunctionBegin;
996076ba34aSJunchao Zhang   MatCheckProduct(C,1);
997076ba34aSJunchao Zhang   mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
998076ba34aSJunchao Zhang   ptype  = product->type;
999076ba34aSJunchao Zhang   A      = product->A;
1000076ba34aSJunchao Zhang   B      = product->B;
1001076ba34aSJunchao Zhang   Ad     = static_cast<Mat_MPIAIJ*>(A->data)->A;
1002076ba34aSJunchao Zhang   Ao     = static_cast<Mat_MPIAIJ*>(A->data)->B;
1003076ba34aSJunchao Zhang   Bd     = static_cast<Mat_MPIAIJ*>(B->data)->A;
1004076ba34aSJunchao Zhang   Bo     = static_cast<Mat_MPIAIJ*>(B->data)->B;
1005076ba34aSJunchao Zhang 
1006076ba34aSJunchao Zhang   if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1007076ba34aSJunchao Zhang     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
1008076ba34aSJunchao Zhang     ab  = mmdata->mmAB;
1009076ba34aSJunchao Zhang     atb = mmdata->mmAtB;
1010076ba34aSJunchao Zhang     if (ab) {
1011076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1012076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1013076ba34aSJunchao Zhang     }
1014076ba34aSJunchao Zhang     if (atb) {
1015076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1016076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1017076ba34aSJunchao Zhang     }
1018076ba34aSJunchao Zhang     PetscFunctionReturn(0);
1019076ba34aSJunchao Zhang   }
1020076ba34aSJunchao Zhang 
1021076ba34aSJunchao Zhang   if (ptype == MATPRODUCT_AB) {
1022076ba34aSJunchao Zhang     ab   = mmdata->mmAB;
1023076ba34aSJunchao Zhang     /* C1 = Ad * B_local */
10242c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB");
10255f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local));
10265f80ce2aSJacob Faibussowitsch     PetscCheck(ab->C1->product->B == ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed");
10275f80ce2aSJacob Faibussowitsch     if (ab->C1->product->A != Ad) CHKERRQ(MatProductReplaceMats(Ad,NULL,NULL,ab->C1));
10285f80ce2aSJacob Faibussowitsch     CHKERRQ((*ab->C1->ops->productnumeric)(ab->C1));
10295f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
10305f80ce2aSJacob Faibussowitsch                                  ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1031076ba34aSJunchao Zhang     /* C2 = Ao * B_other */
10322c71b3e2SJacob Faibussowitsch     PetscCheckFalse(ab->C2->product->B != ab->B_other,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed");
10335f80ce2aSJacob Faibussowitsch     if (ab->C1->product->A != Ao) CHKERRQ(MatProductReplaceMats(Ao,NULL,NULL,ab->C2));
10345f80ce2aSJacob Faibussowitsch     CHKERRQ((*ab->C2->ops->productnumeric)(ab->C2));
1035076ba34aSJunchao Zhang     /* C = C1_global + C2_global */
1036076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1037076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(ab);
1038076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_AtB) {
1039076ba34aSJunchao Zhang     atb  = mmdata->mmAtB;
10402c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB");
1041076ba34aSJunchao Zhang     /* C1 = Ad^t * B_local */
10425f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local));
10432c71b3e2SJacob Faibussowitsch     PetscCheckFalse(atb->C1->product->B != atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed");
10445f80ce2aSJacob Faibussowitsch     if (atb->C1->product->A != Ad) CHKERRQ(MatProductReplaceMats(Ad,NULL,NULL,atb->C1));
10455f80ce2aSJacob Faibussowitsch     CHKERRQ((*atb->C1->ops->productnumeric)(atb->C1));
1046076ba34aSJunchao Zhang 
1047076ba34aSJunchao Zhang     /* C2 = Ao^t * B_local */
10482c71b3e2SJacob Faibussowitsch     PetscCheckFalse(atb->C2->product->B != atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed");
10495f80ce2aSJacob Faibussowitsch     if (atb->C2->product->A != Ao) CHKERRQ(MatProductReplaceMats(Ao,NULL,NULL,atb->C2));
10505f80ce2aSJacob Faibussowitsch     CHKERRQ((*atb->C2->ops->productnumeric)(atb->C2));
1051076ba34aSJunchao Zhang     /* Form C2_global */
10525f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
10535f80ce2aSJacob Faibussowitsch                                   atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1054076ba34aSJunchao Zhang     /* C = C1_global + C2_global */
1055076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1056076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(atb);
1057076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1058076ba34aSJunchao Zhang     ab   = mmdata->mmAB;
10595f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local));
1060076ba34aSJunchao Zhang 
1061076ba34aSJunchao Zhang     /* ab->C1 = Ad * B_local */
10622c71b3e2SJacob Faibussowitsch     PetscCheckFalse(ab->C1->product->B != ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed");
10635f80ce2aSJacob Faibussowitsch     if (ab->C1->product->A != Ad) CHKERRQ(MatProductReplaceMats(Ad,NULL,NULL,ab->C1));
10645f80ce2aSJacob Faibussowitsch     CHKERRQ((*ab->C1->ops->productnumeric)(ab->C1));
10655f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
10665f80ce2aSJacob Faibussowitsch                                  ab->abuf,ab->rows,ab->rowoffset,ab->B_other));
1067076ba34aSJunchao Zhang     /* ab->C2 = Ao * B_other */
10685f80ce2aSJacob Faibussowitsch     if (ab->C2->product->A != Ao) CHKERRQ(MatProductReplaceMats(Ao,NULL,NULL,ab->C2));
10695f80ce2aSJacob Faibussowitsch     CHKERRQ((*ab->C2->ops->productnumeric)(ab->C2)); /* C2 = Ao * B_other */
1070076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1071076ba34aSJunchao Zhang 
1072076ba34aSJunchao Zhang     /* atb->C1 = Bd^t * ab->C_petsc */
1073076ba34aSJunchao Zhang     atb  = mmdata->mmAtB;
10742c71b3e2SJacob Faibussowitsch     PetscCheckFalse(atb->C1->product->B != ab->C_petsc,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed");
10755f80ce2aSJacob Faibussowitsch     if (atb->C1->product->A != Bd) CHKERRQ(MatProductReplaceMats(Bd,NULL,NULL,atb->C1));
10765f80ce2aSJacob Faibussowitsch     CHKERRQ((*atb->C1->ops->productnumeric)(atb->C1));
1077076ba34aSJunchao Zhang     /* atb->C2 = Bo^t * ab->C_petsc */
10785f80ce2aSJacob Faibussowitsch     if (atb->C2->product->A != Bo) CHKERRQ(MatProductReplaceMats(Bo,NULL,NULL,atb->C2));
10795f80ce2aSJacob Faibussowitsch     CHKERRQ((*atb->C2->ops->productnumeric)(atb->C2));
10805f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
10815f80ce2aSJacob Faibussowitsch                                   atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global));
1082076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1083076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(atb);
1084076ba34aSJunchao Zhang   }
1085076ba34aSJunchao Zhang   /* Split C_global to form C */
10865f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart));
1087076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1088076ba34aSJunchao Zhang }
1089076ba34aSJunchao Zhang 
1090076ba34aSJunchao Zhang PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1091076ba34aSJunchao Zhang {
1092076ba34aSJunchao Zhang   Mat                         A,B;
1093076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
1094076ba34aSJunchao Zhang   MatProductType              ptype;
1095076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos *mmdata;
1096076ba34aSJunchao Zhang   MatMatStruct                *mm = NULL;
1097076ba34aSJunchao Zhang   IS                          glob = NULL;
1098076ba34aSJunchao Zhang   const PetscInt              *garray;
1099076ba34aSJunchao Zhang   PetscInt                    m,n,M,N,sz;
1100076ba34aSJunchao Zhang   ConstMatColIdxKokkosView    l2g; /* map local col ids to global ones */
1101076ba34aSJunchao Zhang 
1102076ba34aSJunchao Zhang   PetscFunctionBegin;
1103076ba34aSJunchao Zhang   MatCheckProduct(C,1);
1104*28b400f6SJacob Faibussowitsch   PetscCheck(!product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1105076ba34aSJunchao Zhang   ptype = product->type;
1106076ba34aSJunchao Zhang   A     = product->A;
1107076ba34aSJunchao Zhang   B     = product->B;
1108076ba34aSJunchao Zhang 
1109076ba34aSJunchao Zhang   switch (ptype) {
1110076ba34aSJunchao Zhang     case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1111076ba34aSJunchao Zhang     case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1112076ba34aSJunchao Zhang     case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
111398921bdaSJacob Faibussowitsch     default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1114076ba34aSJunchao Zhang   }
1115076ba34aSJunchao Zhang 
11165f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(C,m,n,M,N));
11175f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(C->rmap));
11185f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(C->cmap));
11195f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(C,((PetscObject)A)->type_name));
1120076ba34aSJunchao Zhang 
1121076ba34aSJunchao Zhang   mmdata           = new MatProductData_MPIAIJKokkos();
1122076ba34aSJunchao Zhang   mmdata->reusesym = product->api_user;
1123076ba34aSJunchao Zhang 
1124076ba34aSJunchao Zhang   if (ptype == MATPRODUCT_AB) {
1125076ba34aSJunchao Zhang     mmdata->mmAB = new MatMatStruct_AB();
11265f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB));
1127076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(mmdata->mmAB);
1128076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_AtB) {
1129076ba34aSJunchao Zhang     mmdata->mmAtB = new MatMatStruct_AtB();
1130076ba34aSJunchao Zhang     auto atb      = mmdata->mmAtB;
11315f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local));
11325f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetIndices(glob,&garray));
11335f80ce2aSJacob Faibussowitsch     CHKERRQ(ISGetSize(glob,&sz));
1134076ba34aSJunchao Zhang     l2g  = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
11355f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb));
11365f80ce2aSJacob Faibussowitsch     CHKERRQ(ISRestoreIndices(glob,&garray));
11375f80ce2aSJacob Faibussowitsch     CHKERRQ(ISDestroy(&glob));
1138076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(atb);
1139076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1140076ba34aSJunchao Zhang     mmdata->mmAB  = new MatMatStruct_AB(); /* tmp=A*B */
1141076ba34aSJunchao Zhang     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1142076ba34aSJunchao Zhang     auto ab       = mmdata->mmAB;
1143076ba34aSJunchao Zhang     auto atb      = mmdata->mmAtB;
11445f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab));
1145076ba34aSJunchao Zhang     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
11465f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc));
11475f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb));
1148076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(atb);
1149076ba34aSJunchao Zhang   }
1150076ba34aSJunchao Zhang   /* Split the C_global into petsc A, B format */
11515f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart));
1152076ba34aSJunchao Zhang   C->product->data        = mmdata;
1153076ba34aSJunchao Zhang   C->product->destroy     = MatProductDataDestroy_MPIAIJKokkos;
1154076ba34aSJunchao Zhang   C->ops->productnumeric  = MatProductNumeric_MPIAIJKokkos;
1155076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1156076ba34aSJunchao Zhang }
1157076ba34aSJunchao Zhang 
1158076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1159076ba34aSJunchao Zhang {
1160076ba34aSJunchao Zhang   PetscErrorCode ierr;
1161076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
1162076ba34aSJunchao Zhang   PetscBool      match = PETSC_FALSE;
1163076ba34aSJunchao Zhang   PetscBool      usecpu = PETSC_FALSE;
1164076ba34aSJunchao Zhang 
1165076ba34aSJunchao Zhang   PetscFunctionBegin;
1166076ba34aSJunchao Zhang   MatCheckProduct(mat,1);
1167076ba34aSJunchao Zhang   if (!product->A->boundtocpu && !product->B->boundtocpu) {
11685f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match));
1169076ba34aSJunchao Zhang   }
1170076ba34aSJunchao Zhang   if (match) { /* we can always fallback to the CPU if requested */
1171076ba34aSJunchao Zhang     switch (product->type) {
1172076ba34aSJunchao Zhang     case MATPRODUCT_AB:
1173076ba34aSJunchao Zhang       if (product->api_user) {
1174076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
11755f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
1176076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1177076ba34aSJunchao Zhang       } else {
1178076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
11795f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
1180076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1181076ba34aSJunchao Zhang       }
1182076ba34aSJunchao Zhang       break;
1183076ba34aSJunchao Zhang     case MATPRODUCT_AtB:
1184076ba34aSJunchao Zhang       if (product->api_user) {
1185076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
11865f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
1187076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1188076ba34aSJunchao Zhang       } else {
1189076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
11905f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
1191076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1192076ba34aSJunchao Zhang       }
1193076ba34aSJunchao Zhang       break;
1194076ba34aSJunchao Zhang     case MATPRODUCT_PtAP:
1195076ba34aSJunchao Zhang       if (product->api_user) {
1196076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
11975f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
1198076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1199076ba34aSJunchao Zhang       } else {
1200076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
12015f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
1202076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1203076ba34aSJunchao Zhang       }
1204076ba34aSJunchao Zhang       break;
1205076ba34aSJunchao Zhang     default:
1206076ba34aSJunchao Zhang       break;
1207076ba34aSJunchao Zhang     }
1208076ba34aSJunchao Zhang     match = (PetscBool)!usecpu;
1209076ba34aSJunchao Zhang   }
1210076ba34aSJunchao Zhang   if (match) {
1211076ba34aSJunchao Zhang     switch (product->type) {
1212076ba34aSJunchao Zhang     case MATPRODUCT_AB:
1213076ba34aSJunchao Zhang     case MATPRODUCT_AtB:
1214076ba34aSJunchao Zhang     case MATPRODUCT_PtAP:
1215076ba34aSJunchao Zhang       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1216076ba34aSJunchao Zhang       break;
1217076ba34aSJunchao Zhang     default:
1218076ba34aSJunchao Zhang       break;
1219076ba34aSJunchao Zhang     }
1220076ba34aSJunchao Zhang   }
1221076ba34aSJunchao Zhang   /* fallback to MPIAIJ ops */
1222076ba34aSJunchao Zhang   if (!mat->ops->productsymbolic) {
12235f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSetFromOptions_MPIAIJ(mat));
1224076ba34aSJunchao Zhang   }
1225076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1226076ba34aSJunchao Zhang }
1227076ba34aSJunchao Zhang 
122882a78a4eSJed Brown static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
122942550becSJunchao Zhang {
1230394ed5ebSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)mat->data;
1231cbc6b225SStefano Zampini   Mat_MPIAIJKokkos *mpikok;
123242550becSJunchao Zhang 
123342550becSJunchao Zhang   PetscFunctionBegin;
12345f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
1235cbc6b225SStefano Zampini   mat->preallocated = PETSC_TRUE;
12365f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY));
12375f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY));
12385f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(mat));
1239cbc6b225SStefano Zampini   mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1240cbc6b225SStefano Zampini   delete mpikok;
1241394ed5ebSJunchao Zhang   mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
124242550becSJunchao Zhang   PetscFunctionReturn(0);
124342550becSJunchao Zhang }
124442550becSJunchao Zhang 
124542550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode)
124642550becSJunchao Zhang {
1247394ed5ebSJunchao Zhang   Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
124842550becSJunchao Zhang   Mat_MPIAIJKokkos               *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
124942550becSJunchao Zhang   Mat                            A = mpiaij->A,B = mpiaij->B;
1250394ed5ebSJunchao Zhang   PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
125142550becSJunchao Zhang   MatScalarKokkosView            Aa,Ba;
1252394ed5ebSJunchao Zhang   MatScalarKokkosView            v1;
125342550becSJunchao Zhang   MatScalarKokkosView&           vsend = mpikok->sendbuf_d;
125442550becSJunchao Zhang   const MatScalarKokkosView&     v2 = mpikok->recvbuf_d;
1255394ed5ebSJunchao Zhang   const PetscCountKokkosView&    Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d;
1256394ed5ebSJunchao Zhang   const PetscCountKokkosView&    Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d;
1257394ed5ebSJunchao Zhang   const PetscCountKokkosView&    Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d;
1258394ed5ebSJunchao Zhang   const PetscCountKokkosView&    Cperm1 = mpikok->Cperm1_d;
125942550becSJunchao Zhang   PetscMemType                   memtype;
126042550becSJunchao Zhang 
126142550becSJunchao Zhang   PetscFunctionBegin;
12625f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGetMemType(v,&memtype)); /* Return PETSC_MEMTYPE_HOST when v is NULL */
126342550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */
1264394ed5ebSJunchao Zhang     v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n));
126542550becSJunchao Zhang   } else {
1266394ed5ebSJunchao Zhang     v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */
126742550becSJunchao Zhang   }
126842550becSJunchao Zhang 
126942550becSJunchao Zhang   if (imode == INSERT_VALUES) {
12705f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
12715f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJGetKokkosViewWrite(B,&Ba));
127242550becSJunchao Zhang     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
127342550becSJunchao Zhang     Kokkos::deep_copy(Ba,0.0);
1274394ed5ebSJunchao Zhang   } else {
12755f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
12765f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJGetKokkosView(B,&Ba));
127742550becSJunchao Zhang   }
127842550becSJunchao Zhang 
127942550becSJunchao Zhang   /* Pack entries to be sent to remote */
1280394ed5ebSJunchao Zhang   Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));});
128142550becSJunchao Zhang 
128242550becSJunchao Zhang   /* Send remote entries to their owner and overlap the communication with local computation */
12835f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE));
128442550becSJunchao Zhang   /* Add local entries to A and B */
1285394ed5ebSJunchao Zhang   Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));});
1286394ed5ebSJunchao Zhang   Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));});
12875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE));
128842550becSJunchao Zhang 
128942550becSJunchao Zhang   /* Add received remote entries to A and B */
1290394ed5ebSJunchao Zhang   Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));});
1291394ed5ebSJunchao Zhang   Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));});
129242550becSJunchao Zhang 
1293394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {
12945f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); /* Increase A & B's state etc. */
12955f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJRestoreKokkosViewWrite(B,&Ba));
1296394ed5ebSJunchao Zhang   } else {
12975f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJRestoreKokkosView(A,&Aa));
12985f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJRestoreKokkosView(B,&Ba));
1299394ed5ebSJunchao Zhang   }
130042550becSJunchao Zhang   PetscFunctionReturn(0);
130142550becSJunchao Zhang }
130242550becSJunchao Zhang 
1303076ba34aSJunchao Zhang PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1304076ba34aSJunchao Zhang {
130542550becSJunchao Zhang   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
1306076ba34aSJunchao Zhang 
1307076ba34aSJunchao Zhang   PetscFunctionBegin;
13085f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
13095f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
13105f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",   NULL));
13115f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",          NULL));
131242550becSJunchao Zhang   delete (Mat_MPIAIJKokkos*)mpiaij->spptr;
13135f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy_MPIAIJ(A));
1314076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1315076ba34aSJunchao Zhang }
1316076ba34aSJunchao Zhang 
13178c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
13188c3ff71bSJunchao Zhang {
13198c3ff71bSJunchao Zhang   Mat                B;
1320076ba34aSJunchao Zhang   Mat_MPIAIJ         *a;
13218c3ff71bSJunchao Zhang 
13228c3ff71bSJunchao Zhang   PetscFunctionBegin;
13238c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
13245f80ce2aSJacob Faibussowitsch     CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
13258c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
13265f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
13278c3ff71bSJunchao Zhang   }
13288c3ff71bSJunchao Zhang   B = *newmat;
13298c3ff71bSJunchao Zhang 
13306f3d89d0SStefano Zampini   B->boundtocpu = PETSC_FALSE;
13315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(B->defaultvectype));
13325f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrallocpy(VECKOKKOS,&B->defaultvectype));
13335f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS));
13348c3ff71bSJunchao Zhang 
1335076ba34aSJunchao Zhang   a = static_cast<Mat_MPIAIJ*>(A->data);
13365f80ce2aSJacob Faibussowitsch   if (a->A) CHKERRQ(MatSetType(a->A,MATSEQAIJKOKKOS));
13375f80ce2aSJacob Faibussowitsch   if (a->B) CHKERRQ(MatSetType(a->B,MATSEQAIJKOKKOS));
13385f80ce2aSJacob Faibussowitsch   if (a->lvec) CHKERRQ(VecSetType(a->lvec,VECSEQKOKKOS));
1339076ba34aSJunchao Zhang 
13408c3ff71bSJunchao Zhang   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
13418c3ff71bSJunchao Zhang   B->ops->mult                  = MatMult_MPIAIJKokkos;
13428c3ff71bSJunchao Zhang   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
13438c3ff71bSJunchao Zhang   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1344076ba34aSJunchao Zhang   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1345076ba34aSJunchao Zhang   B->ops->destroy               = MatDestroy_MPIAIJKokkos;
13468c3ff71bSJunchao Zhang 
13475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos));
13485f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos));
13495f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C",   MatSetPreallocationCOO_MPIAIJKokkos));
13505f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C",          MatSetValuesCOO_MPIAIJKokkos));
13518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13528c3ff71bSJunchao Zhang }
13533f3ba80aSJunchao Zhang /*MC
13543f3ba80aSJunchao Zhang    MATSMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
13558c3ff71bSJunchao Zhang 
13563f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
13573f3ba80aSJunchao Zhang 
13583f3ba80aSJunchao Zhang    Options Database Keys:
13593f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
13603f3ba80aSJunchao Zhang 
13613f3ba80aSJunchao Zhang   Level: beginner
13623f3ba80aSJunchao Zhang 
13633f3ba80aSJunchao Zhang .seealso: MatCreateAIJKokkos(), MATSEQAIJKOKKOS
13643f3ba80aSJunchao Zhang M*/
13658c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
13668c3ff71bSJunchao Zhang {
13678c3ff71bSJunchao Zhang   PetscFunctionBegin;
13685f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscKokkosInitializeCheck());
13695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate_MPIAIJ(A));
13705f80ce2aSJacob Faibussowitsch   CHKERRQ(MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
13718c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13728c3ff71bSJunchao Zhang }
13738c3ff71bSJunchao Zhang 
13748c3ff71bSJunchao Zhang /*@C
13758c3ff71bSJunchao Zhang    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
13768c3ff71bSJunchao Zhang    (the default parallel PETSc format).  This matrix will ultimately pushed down
13778c3ff71bSJunchao Zhang    to Kokkos for calculations. For good matrix
13788c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
13798c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
13808c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
13818c3ff71bSJunchao Zhang 
13828c3ff71bSJunchao Zhang    Collective
13838c3ff71bSJunchao Zhang 
13848c3ff71bSJunchao Zhang    Input Parameters:
13858c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
13868c3ff71bSJunchao Zhang .  m - number of rows
13878c3ff71bSJunchao Zhang .  n - number of columns
13888c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
13898c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
13908c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
13918c3ff71bSJunchao Zhang 
13928c3ff71bSJunchao Zhang    Output Parameter:
13938c3ff71bSJunchao Zhang .  A - the matrix
13948c3ff71bSJunchao Zhang 
13958c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
13968c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradigm instead of this routine directly.
13978c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
13988c3ff71bSJunchao Zhang 
13998c3ff71bSJunchao Zhang    Notes:
14008c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
14018c3ff71bSJunchao Zhang 
14028c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
14038c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
14048c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
14058c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
14068c3ff71bSJunchao Zhang 
14078c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
14088c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
14098c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
14108c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
14118c3ff71bSJunchao Zhang 
14128c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
14138c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
14148c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
14158c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
14168c3ff71bSJunchao Zhang 
14178c3ff71bSJunchao Zhang    Level: intermediate
14188c3ff71bSJunchao Zhang 
14198c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
14208c3ff71bSJunchao Zhang @*/
14218c3ff71bSJunchao Zhang PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
14228c3ff71bSJunchao Zhang {
14238c3ff71bSJunchao Zhang   PetscMPIInt    size;
14248c3ff71bSJunchao Zhang 
14258c3ff71bSJunchao Zhang   PetscFunctionBegin;
14265f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(comm,A));
14275f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*A,m,n,M,N));
14285f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
14298c3ff71bSJunchao Zhang   if (size > 1) {
14305f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetType(*A,MATMPIAIJKOKKOS));
14315f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
14328c3ff71bSJunchao Zhang   } else {
14335f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetType(*A,MATSEQAIJKOKKOS));
14345f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
14358c3ff71bSJunchao Zhang   }
14368c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
14378c3ff71bSJunchao Zhang }
14388c3ff71bSJunchao Zhang 
1439a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
1440042217e8SBarry Smith PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1441a587d139SMark {
1442a587d139SMark   PetscMPIInt                size,rank;
1443a587d139SMark   MPI_Comm                   comm;
1444042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat=NULL;
1445a587d139SMark 
1446a587d139SMark   PetscFunctionBegin;
14475f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
14485f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
14495f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(comm,&rank));
1450a587d139SMark   if (size == 1) {
14515f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetDeviceMat(A,&d_mat));
14525f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(A)); /* Since we are going to modify matrix values on device */
1453a587d139SMark   } else {
1454a587d139SMark     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
14555f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat));
14565f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(aij->A));
14575f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(aij->B));
14582c71b3e2SJacob Faibussowitsch     PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
1459a587d139SMark   }
1460a587d139SMark   // act like MatSetValues because not called on host
1461a587d139SMark   if (A->assembled) {
1462a587d139SMark     if (A->was_assembled) {
14635f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscInfo(A,"Assemble more than once already\n"));
1464a587d139SMark     }
1465a587d139SMark     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
1466a587d139SMark   } else {
14675f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled));
1468a587d139SMark   }
1469a587d139SMark   if (!d_mat) {
1470042217e8SBarry Smith     struct _n_SplitCSRMat h_mat; /* host container */
1471a587d139SMark     Mat_SeqAIJKokkos      *aijkokA;
1472a587d139SMark     Mat_SeqAIJ            *jaca;
1473a587d139SMark     PetscInt              n = A->rmap->n, nnz;
1474a587d139SMark     Mat                   Amat;
1475042217e8SBarry Smith     PetscInt              *colmap;
1476042217e8SBarry Smith 
1477042217e8SBarry Smith     /* create and copy h_mat */
147849b994a9SMark Adams     h_mat.M = A->cmap->N; // use for debug build
14795f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Create device matrix in Kokkos\n"));
1480a587d139SMark     if (size == 1) {
1481a587d139SMark       Amat = A;
1482a587d139SMark       jaca = (Mat_SeqAIJ*)A->data;
1483a587d139SMark       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1484a587d139SMark       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1485a587d139SMark       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1486a587d139SMark       h_mat.offdiag.a = NULL;
1487a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1488a587d139SMark     } else {
1489a587d139SMark       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
1490a587d139SMark       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
1491a587d139SMark       PetscInt         ii;
1492a587d139SMark       Mat_SeqAIJKokkos *aijkokB;
1493042217e8SBarry Smith 
1494a587d139SMark       Amat = aij->A;
1495a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1496a587d139SMark       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1497a587d139SMark       jaca = (Mat_SeqAIJ*)aij->A->data;
14982c71b3e2SJacob Faibussowitsch       PetscCheckFalse(aij->B->cmap->n && !aij->garray,comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
14992c71b3e2SJacob Faibussowitsch       PetscCheckFalse(aij->B->rmap->n != aij->A->rmap->n,comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
1500a587d139SMark       aij->donotstash = PETSC_TRUE;
1501a587d139SMark       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1502a5b23f4aSJose E. Roman       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
15035f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscCalloc1(A->cmap->N,&colmap));
15045f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt)));
1505042217e8SBarry Smith       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1506a587d139SMark       // allocate B copy data
1507a587d139SMark       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1508a587d139SMark       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1509a587d139SMark       nnz = jacb->i[n];
1510a587d139SMark       if (jacb->compressedrow.use) {
1511a587d139SMark         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
1512300d22a6SJunchao Zhang         aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1513300d22a6SJunchao Zhang         Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k);
1514300d22a6SJunchao Zhang         h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1515a587d139SMark       } else {
151699551766SMark Adams          h_mat.offdiag.i = aijkokB->i_device_data();
1517a587d139SMark       }
151899551766SMark Adams       h_mat.offdiag.j = aijkokB->j_device_data();
1519076ba34aSJunchao Zhang       h_mat.offdiag.a = aijkokB->a_device_data();
1520a587d139SMark       {
1521042217e8SBarry Smith         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
1522300d22a6SJunchao Zhang         aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1523300d22a6SJunchao Zhang         Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k);
1524300d22a6SJunchao Zhang         h_mat.colmap = aijkokB->colmap_d.data();
15255f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscFree(colmap));
1526a587d139SMark       }
1527a587d139SMark       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1528a587d139SMark       h_mat.offdiag.n = n;
1529a587d139SMark     }
1530a587d139SMark     // allocate A copy data
1531a587d139SMark     nnz = jaca->i[n];
1532a587d139SMark     h_mat.diag.n = n;
1533a587d139SMark     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
15345f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_rank(comm,&h_mat.rank));
15352c71b3e2SJacob Faibussowitsch     PetscCheckFalse(jaca->compressedrow.use,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
1536042217e8SBarry Smith     else {
153799551766SMark Adams       h_mat.diag.i = aijkokA->i_device_data();
1538a587d139SMark     }
153999551766SMark Adams     h_mat.diag.j = aijkokA->j_device_data();
1540076ba34aSJunchao Zhang     h_mat.diag.a = aijkokA->a_device_data();
1541a587d139SMark     // copy pointers and metdata to device
15425f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat));
15435f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat));
15445f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz));
1545a587d139SMark   }
1546a587d139SMark   *B = d_mat; // return it, set it in Mat, and set it up
1547a587d139SMark   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
1548a587d139SMark   PetscFunctionReturn(0);
1549a587d139SMark }
1550076ba34aSJunchao Zhang 
1551076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1552076ba34aSJunchao Zhang {
1553076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1554076ba34aSJunchao Zhang 
1555076ba34aSJunchao Zhang   PetscFunctionBegin;
1556076ba34aSJunchao Zhang   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1557076ba34aSJunchao Zhang   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1558076ba34aSJunchao Zhang   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1559076ba34aSJunchao Zhang   else *mask = "PETSC_OFFLOAD_BOTH";
1560076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1561076ba34aSJunchao Zhang }
1562076ba34aSJunchao Zhang 
1563076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1564076ba34aSJunchao Zhang {
1565076ba34aSJunchao Zhang   PetscMPIInt  size;
1566076ba34aSJunchao Zhang   Mat          Ad,Ao;
1567076ba34aSJunchao Zhang   const char  *amask,*bmask;
1568076ba34aSJunchao Zhang 
1569076ba34aSJunchao Zhang   PetscFunctionBegin;
15705f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1571076ba34aSJunchao Zhang 
1572076ba34aSJunchao Zhang   if (size == 1) {
15735f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetOffloadMask(A,&amask));
15745f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%s\n",amask));
1575076ba34aSJunchao Zhang   } else {
1576076ba34aSJunchao Zhang     Ad  = ((Mat_MPIAIJ*)A->data)->A;
1577076ba34aSJunchao Zhang     Ao  = ((Mat_MPIAIJ*)A->data)->B;
15785f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetOffloadMask(Ad,&amask));
15795f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGetOffloadMask(Ao,&bmask));
15805f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask));
1581076ba34aSJunchao Zhang   }
1582076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1583076ba34aSJunchao Zhang }
1584