xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision c0aa6a63a7860d309a895cb4aa0f9e11e7859f3a)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscsf.h>
38c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h>
4a587d139SMark #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
5076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
611d22bbfSJunchao Zhang 
78c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
88c3ff71bSJunchao Zhang {
98c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
108c3ff71bSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
11a587d139SMark   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
128c3ff71bSJunchao Zhang 
138c3ff71bSJunchao Zhang   PetscFunctionBegin;
148c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
15a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
16a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
17a587d139SMark   }
18a587d139SMark 
198c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
208c3ff71bSJunchao Zhang }
218c3ff71bSJunchao Zhang 
228c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
238c3ff71bSJunchao Zhang {
248c3ff71bSJunchao Zhang   PetscErrorCode ierr;
258c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
268c3ff71bSJunchao Zhang 
278c3ff71bSJunchao Zhang   PetscFunctionBegin;
288c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
298c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
306a29ce69SStefano Zampini #if defined(PETSC_USE_DEBUG)
318c3ff71bSJunchao Zhang   if (d_nnz) {
326a29ce69SStefano Zampini     PetscInt i;
338c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
34*c0aa6a63SJacob Faibussowitsch       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
358c3ff71bSJunchao Zhang     }
368c3ff71bSJunchao Zhang   }
378c3ff71bSJunchao Zhang   if (o_nnz) {
386a29ce69SStefano Zampini     PetscInt i;
398c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
40*c0aa6a63SJacob Faibussowitsch       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
418c3ff71bSJunchao Zhang     }
428c3ff71bSJunchao Zhang   }
436a29ce69SStefano Zampini #endif
446a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
456a29ce69SStefano Zampini   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
466a29ce69SStefano Zampini #else
476a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
486a29ce69SStefano Zampini #endif
496a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
506a29ce69SStefano Zampini   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
516a29ce69SStefano Zampini   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
526a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
536a29ce69SStefano Zampini   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
546a29ce69SStefano Zampini 
556a29ce69SStefano Zampini   if (!mpiaij->A) {
568c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
578c3ff71bSJunchao Zhang     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
588c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
596a29ce69SStefano Zampini   }
606a29ce69SStefano Zampini   if (!mpiaij->B) {
616a29ce69SStefano Zampini     PetscMPIInt size;
6255b25c41SPierre Jolivet     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
638c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
646a29ce69SStefano Zampini     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
658c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
668c3ff71bSJunchao Zhang   }
676a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
686a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
698c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
708c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
718c3ff71bSJunchao Zhang   mat->preallocated = PETSC_TRUE;
728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
738c3ff71bSJunchao Zhang }
748c3ff71bSJunchao Zhang 
758c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
768c3ff71bSJunchao Zhang {
778c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
788c3ff71bSJunchao Zhang   PetscErrorCode ierr;
798c3ff71bSJunchao Zhang   PetscInt       nt;
808c3ff71bSJunchao Zhang 
818c3ff71bSJunchao Zhang   PetscFunctionBegin;
828c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
83*c0aa6a63SJacob Faibussowitsch   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
848c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
858c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
868c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
878c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
888c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
898c3ff71bSJunchao Zhang }
908c3ff71bSJunchao Zhang 
918c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
928c3ff71bSJunchao Zhang {
938c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
948c3ff71bSJunchao Zhang   PetscErrorCode ierr;
958c3ff71bSJunchao Zhang   PetscInt       nt;
968c3ff71bSJunchao Zhang 
978c3ff71bSJunchao Zhang   PetscFunctionBegin;
988c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
99*c0aa6a63SJacob Faibussowitsch   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
1008c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1018c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
1028c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1038c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
1048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1058c3ff71bSJunchao Zhang }
1068c3ff71bSJunchao Zhang 
1078c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
1088c3ff71bSJunchao Zhang {
1098c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
1108c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1118c3ff71bSJunchao Zhang   PetscInt       nt;
1128c3ff71bSJunchao Zhang 
1138c3ff71bSJunchao Zhang   PetscFunctionBegin;
1148c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
115*c0aa6a63SJacob Faibussowitsch   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt);
1168c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
1178c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
1188c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1198c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1218c3ff71bSJunchao Zhang }
1228c3ff71bSJunchao Zhang 
123076ba34aSJunchao Zhang /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
124076ba34aSJunchao Zhang    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
125076ba34aSJunchao Zhang    C still uses local column ids. Their corresponding global column ids are returned in glob.
126076ba34aSJunchao Zhang */
127076ba34aSJunchao Zhang PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
128076ba34aSJunchao Zhang {
129076ba34aSJunchao Zhang   Mat            Ad,Ao;
130076ba34aSJunchao Zhang   const PetscInt *cmap;
131076ba34aSJunchao Zhang   PetscErrorCode ierr;
132076ba34aSJunchao Zhang 
133076ba34aSJunchao Zhang   PetscFunctionBegin;
134076ba34aSJunchao Zhang   ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr);
135076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr);
136076ba34aSJunchao Zhang   if (glob) {
137076ba34aSJunchao Zhang     PetscInt cst, i, dn, on, *gidx;
138076ba34aSJunchao Zhang     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
139076ba34aSJunchao Zhang     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
140076ba34aSJunchao Zhang     ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr);
141076ba34aSJunchao Zhang     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
142076ba34aSJunchao Zhang     for (i=0; i<dn; i++) gidx[i]    = cst + i;
143076ba34aSJunchao Zhang     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
144076ba34aSJunchao Zhang     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
145076ba34aSJunchao Zhang   }
146076ba34aSJunchao Zhang   PetscFunctionReturn(0);
147076ba34aSJunchao Zhang }
148076ba34aSJunchao Zhang 
149076ba34aSJunchao Zhang /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
150076ba34aSJunchao Zhang struct MatMatStruct {
151076ba34aSJunchao Zhang   MatRowMapKokkosView   Cdstart; /* Used to split sequential matrix into petsc's A, B format */
152076ba34aSJunchao Zhang   PetscSF               sf; /* SF to send/recv matrix entries */
153076ba34aSJunchao Zhang   MatScalarKokkosView   abuf; /* buf of mat values in send/recv */
154076ba34aSJunchao Zhang   Mat                   C1,C2,B_local;
155076ba34aSJunchao Zhang   KokkosCsrMatrix       C1_global,C2_global,C_global;
156076ba34aSJunchao Zhang   KernelHandle          kh;
157076ba34aSJunchao Zhang   MatMatStruct() {
158076ba34aSJunchao Zhang     C1 = C2 = B_local = NULL;
159076ba34aSJunchao Zhang     sf = NULL;
160076ba34aSJunchao Zhang   }
161076ba34aSJunchao Zhang 
162076ba34aSJunchao Zhang   ~MatMatStruct() {
163076ba34aSJunchao Zhang     MatDestroy(&C1);
164076ba34aSJunchao Zhang     MatDestroy(&C2);
165076ba34aSJunchao Zhang     MatDestroy(&B_local);
166076ba34aSJunchao Zhang     PetscSFDestroy(&sf);
167076ba34aSJunchao Zhang     kh.destroy_spadd_handle();
168076ba34aSJunchao Zhang   }
169076ba34aSJunchao Zhang };
170076ba34aSJunchao Zhang 
171076ba34aSJunchao Zhang struct MatMatStruct_AB : public MatMatStruct {
172076ba34aSJunchao Zhang   MatColIdxKokkosView   rows;
173076ba34aSJunchao Zhang   MatRowMapKokkosView   rowoffset;
174076ba34aSJunchao Zhang   Mat                   B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
175076ba34aSJunchao Zhang 
176076ba34aSJunchao Zhang   MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
177076ba34aSJunchao Zhang   ~MatMatStruct_AB() {
178076ba34aSJunchao Zhang     MatDestroy(&B_other);
179076ba34aSJunchao Zhang     MatDestroy(&C_petsc);
180076ba34aSJunchao Zhang   }
181076ba34aSJunchao Zhang };
182076ba34aSJunchao Zhang 
183076ba34aSJunchao Zhang struct MatMatStruct_AtB : public MatMatStruct {
184076ba34aSJunchao Zhang   MatRowMapKokkosView   srcrowoffset,dstrowoffset;
185076ba34aSJunchao Zhang };
186076ba34aSJunchao Zhang 
187076ba34aSJunchao Zhang struct MatProductData_MPIAIJKokkos
188076ba34aSJunchao Zhang {
189076ba34aSJunchao Zhang   MatMatStruct_AB   *mmAB;
190076ba34aSJunchao Zhang   MatMatStruct_AtB  *mmAtB;
191076ba34aSJunchao Zhang   PetscBool         reusesym;
192076ba34aSJunchao Zhang 
193076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
194076ba34aSJunchao Zhang   ~MatProductData_MPIAIJKokkos() {
195076ba34aSJunchao Zhang     delete mmAB;
196076ba34aSJunchao Zhang     delete mmAtB;
197076ba34aSJunchao Zhang   }
198076ba34aSJunchao Zhang };
199076ba34aSJunchao Zhang 
200076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
201076ba34aSJunchao Zhang {
202076ba34aSJunchao Zhang   PetscFunctionBegin;
203076ba34aSJunchao Zhang   CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data));
204076ba34aSJunchao Zhang   PetscFunctionReturn(0);
205076ba34aSJunchao Zhang }
206076ba34aSJunchao Zhang 
207076ba34aSJunchao Zhang /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
208076ba34aSJunchao Zhang 
209076ba34aSJunchao Zhang    Input Parameters:
210076ba34aSJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
211076ba34aSJunchao Zhang .  N       - new column size for the returned Kokkos matrix
212076ba34aSJunchao Zhang -  l2g     - a map that maps old col ids to new col ids
213076ba34aSJunchao Zhang 
214076ba34aSJunchao Zhang    Output Parameters:
215076ba34aSJunchao Zhang .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
216076ba34aSJunchao Zhang  */
217076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
218076ba34aSJunchao Zhang {
219076ba34aSJunchao Zhang   KokkosCsrMatrix&         orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
220076ba34aSJunchao Zhang   MatColIdxKokkosView      jg("jg",orig.nnz()); /* New j array for csrmat */
221076ba34aSJunchao Zhang 
222076ba34aSJunchao Zhang   PetscFunctionBegin;
223076ba34aSJunchao Zhang   CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));}));
224076ba34aSJunchao Zhang   CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg));
225076ba34aSJunchao Zhang   PetscFunctionReturn(0);
226076ba34aSJunchao Zhang }
227076ba34aSJunchao Zhang 
228076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
229076ba34aSJunchao Zhang    It is similar to MatCreateMPIAIJWithSplitArrays.
230076ba34aSJunchao Zhang 
231076ba34aSJunchao Zhang   Input Parameters:
232076ba34aSJunchao Zhang +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
233076ba34aSJunchao Zhang .  A     - the diag matrix using local col ids
234076ba34aSJunchao Zhang -  B     - the offdiag matrix using global col ids
235076ba34aSJunchao Zhang 
236076ba34aSJunchao Zhang   Output Parameters:
237076ba34aSJunchao Zhang .  mat   - the updated MATMPIAIJKOKKOS matrix
238076ba34aSJunchao Zhang */
239076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
240076ba34aSJunchao Zhang {
241076ba34aSJunchao Zhang   PetscErrorCode      ierr;
242076ba34aSJunchao Zhang   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
243076ba34aSJunchao Zhang   PetscInt            m,n,M,N,Am,An,Bm,Bn;
244076ba34aSJunchao Zhang   Mat_SeqAIJKokkos    *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
245076ba34aSJunchao Zhang 
246076ba34aSJunchao Zhang   PetscFunctionBegin;
247076ba34aSJunchao Zhang   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
248076ba34aSJunchao Zhang   ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr);
249076ba34aSJunchao Zhang   ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr);
250076ba34aSJunchao Zhang   ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr);
251076ba34aSJunchao Zhang 
252076ba34aSJunchao Zhang   if (m != Am || m != Bm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match");
253076ba34aSJunchao Zhang   if (n != An) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match");
254076ba34aSJunchao Zhang   if (N != Bn) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match");
255076ba34aSJunchao Zhang   if (mpiaij->A || mpiaij->B) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty");
256076ba34aSJunchao Zhang   mpiaij->A = A;
257076ba34aSJunchao Zhang   mpiaij->B = B;
258076ba34aSJunchao Zhang 
259076ba34aSJunchao Zhang   mat->preallocated     = PETSC_TRUE;
260076ba34aSJunchao Zhang   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
261076ba34aSJunchao Zhang 
262076ba34aSJunchao Zhang   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
263076ba34aSJunchao Zhang   ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
264076ba34aSJunchao Zhang   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
265076ba34aSJunchao Zhang     also gets mpiaij->B compacted, with its col ids and size reduced
266076ba34aSJunchao Zhang   */
267076ba34aSJunchao Zhang   ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
268076ba34aSJunchao Zhang   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr);
269076ba34aSJunchao Zhang   ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
270076ba34aSJunchao Zhang 
271076ba34aSJunchao Zhang   /* Update bkok with new local col ids (stored on host) and size */
272076ba34aSJunchao Zhang   bkok->j_dual.modify_host();
273076ba34aSJunchao Zhang   bkok->j_dual.sync_device();
274076ba34aSJunchao Zhang   bkok->SetColSize(mpiaij->B->cmap->n);
275076ba34aSJunchao Zhang   PetscFunctionReturn(0);
276076ba34aSJunchao Zhang }
277076ba34aSJunchao Zhang 
278076ba34aSJunchao Zhang /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C).
279076ba34aSJunchao Zhang 
280076ba34aSJunchao Zhang    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
281076ba34aSJunchao Zhang    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
282076ba34aSJunchao Zhang    Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k
283076ba34aSJunchao Zhang    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
284076ba34aSJunchao Zhang 
285076ba34aSJunchao Zhang    Collective on comm of ownerSF
286076ba34aSJunchao Zhang 
287076ba34aSJunchao Zhang    Input Parameters:
288076ba34aSJunchao Zhang +   B       - the SEQAIJKOKKOS matrix, using local col ids
289076ba34aSJunchao Zhang .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
290076ba34aSJunchao Zhang .   N       - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
291076ba34aSJunchao Zhang .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
292076ba34aSJunchao Zhang .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
293076ba34aSJunchao Zhang 
294076ba34aSJunchao Zhang    Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
295076ba34aSJunchao Zhang +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
296076ba34aSJunchao Zhang .   abuf      - buffer for sending matrix values
297076ba34aSJunchao Zhang .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
298076ba34aSJunchao Zhang                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
299076ba34aSJunchao Zhang .   rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[]
300076ba34aSJunchao Zhang -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
301076ba34aSJunchao Zhang */
302076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
303076ba34aSJunchao Zhang                                            PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
304076ba34aSJunchao Zhang                                            MatRowMapKokkosView& rowoffset,Mat& C)
305076ba34aSJunchao Zhang {
306076ba34aSJunchao Zhang   PetscErrorCode               ierr;
307076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *bkok,*ckok;
308076ba34aSJunchao Zhang 
309076ba34aSJunchao Zhang   PetscFunctionBegin;
310076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */
311076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
312076ba34aSJunchao Zhang 
313076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
314076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
315076ba34aSJunchao Zhang 
316076ba34aSJunchao Zhang     const auto& Ba = bkok->a_dual.view_device();
317076ba34aSJunchao Zhang     const auto& Bi = bkok->i_dual.view_device();
318076ba34aSJunchao Zhang     const auto& Ca = ckok->a_dual.view_device();
319076ba34aSJunchao Zhang 
320076ba34aSJunchao Zhang     /* Copy Ba to abuf */
321076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
322076ba34aSJunchao Zhang       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
323076ba34aSJunchao Zhang       PetscInt r    = rows(i);
324076ba34aSJunchao Zhang       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
325076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
326076ba34aSJunchao Zhang         abuf(base+k) = Ba(Bi(r)+k);
327076ba34aSJunchao Zhang       });
328076ba34aSJunchao Zhang     });
329076ba34aSJunchao Zhang 
330076ba34aSJunchao Zhang     /* Send abuf to Ca through bcastSF and then mark C is updated on device */
331076ba34aSJunchao Zhang     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */
332076ba34aSJunchao Zhang     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr);
333076ba34aSJunchao Zhang     ckok->a_dual.modify_device();
334076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
335076ba34aSJunchao Zhang     MPI_Comm       comm;
336076ba34aSJunchao Zhang     PetscMPIInt    tag;
337076ba34aSJunchao Zhang     PetscInt       k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;
338076ba34aSJunchao Zhang 
339076ba34aSJunchao Zhang     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr);
340076ba34aSJunchao Zhang     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
341076ba34aSJunchao Zhang     Cm   = nleaves; /* row size of C */
342076ba34aSJunchao Zhang     Cn   = N;  /* col size of C, which initially uses global ids, so we can safely set its col size as N */
343076ba34aSJunchao Zhang 
344076ba34aSJunchao Zhang     /* Get row lens (nz) of B's rows for later fast query */
345076ba34aSJunchao Zhang     PetscInt       *Browlens;
346076ba34aSJunchao Zhang     const PetscInt *tmp = bkok->i_host_data();
347076ba34aSJunchao Zhang     ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr);
348076ba34aSJunchao Zhang     for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];
349076ba34aSJunchao Zhang 
350076ba34aSJunchao Zhang     /* By ownerSF, each proc gets lens of rows of C */
351076ba34aSJunchao Zhang     MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
352076ba34aSJunchao Zhang     Ci_h    = Ci.view_host().data();
353076ba34aSJunchao Zhang     Ci_h[0] = 0;
354076ba34aSJunchao Zhang     ierr    = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
355076ba34aSJunchao Zhang     ierr    = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
356076ba34aSJunchao Zhang     for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
357076ba34aSJunchao Zhang     Cnnz    = Ci_h[Cm];
358076ba34aSJunchao Zhang     Ci.modify_host();
359076ba34aSJunchao Zhang     Ci.sync_device();
360076ba34aSJunchao Zhang 
361076ba34aSJunchao Zhang     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
362076ba34aSJunchao Zhang     MatColIdxKokkosDualView  Cj("j",Cnnz);
363076ba34aSJunchao Zhang     MatScalarKokkosDualView  Ca("a",Cnnz);
364076ba34aSJunchao Zhang 
365076ba34aSJunchao Zhang     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
366076ba34aSJunchao Zhang     const PetscMPIInt *iranks,*ranks;
367076ba34aSJunchao Zhang     const PetscInt    *ioffset,*irootloc,*roffset;
368076ba34aSJunchao Zhang     PetscInt          i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
369076ba34aSJunchao Zhang     MPI_Request       *reqs;
370076ba34aSJunchao Zhang 
371076ba34aSJunchao Zhang     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */
372076ba34aSJunchao Zhang     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */
373076ba34aSJunchao Zhang 
374076ba34aSJunchao Zhang     /* figure out offsets at the send buffer, to build the SF
375076ba34aSJunchao Zhang       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
376076ba34aSJunchao Zhang       rowptr[] - stores offsets for data of each row in abuf
377076ba34aSJunchao Zhang 
378076ba34aSJunchao Zhang       rdisp[]  - to receive sdisp[]
379076ba34aSJunchao Zhang     */
380076ba34aSJunchao Zhang     ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr);
381076ba34aSJunchao Zhang     MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
382076ba34aSJunchao Zhang     rowptr = rowptr_h.data();
383076ba34aSJunchao Zhang 
384076ba34aSJunchao Zhang     sdisp[0] = 0;
385076ba34aSJunchao Zhang     rowptr[0]  = 0;
386076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) { /* for each receiver */
387076ba34aSJunchao Zhang       PetscInt len, nz = 0;
388076ba34aSJunchao Zhang       for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
389076ba34aSJunchao Zhang         len         = Browlens[irootloc[j]];
390076ba34aSJunchao Zhang         rowptr[j+1] = rowptr[j] + len;
391076ba34aSJunchao Zhang         nz         += len;
392076ba34aSJunchao Zhang       }
393076ba34aSJunchao Zhang       sdisp[i+1] = sdisp[i] + nz;
394076ba34aSJunchao Zhang     }
395076ba34aSJunchao Zhang     ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr);
396076ba34aSJunchao Zhang     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
397076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
398076ba34aSJunchao Zhang     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
399076ba34aSJunchao Zhang 
400076ba34aSJunchao Zhang     PetscInt    nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
401076ba34aSJunchao Zhang     PetscInt    nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
402076ba34aSJunchao Zhang     PetscSFNode *iremote;
403076ba34aSJunchao Zhang     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr);
404076ba34aSJunchao Zhang     for (i=0; i<nranks; i++) { /* for each sender */
405076ba34aSJunchao Zhang       k = 0;
406076ba34aSJunchao Zhang       for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
407076ba34aSJunchao Zhang         iremote[j].rank  = ranks[i];
408076ba34aSJunchao Zhang         iremote[j].index = rdisp[i] + k;
409076ba34aSJunchao Zhang         k++;
410076ba34aSJunchao Zhang       }
411076ba34aSJunchao Zhang     }
412076ba34aSJunchao Zhang     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
413076ba34aSJunchao Zhang     ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr);
414076ba34aSJunchao Zhang     ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
415076ba34aSJunchao Zhang 
416076ba34aSJunchao Zhang     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
417076ba34aSJunchao Zhang       from local to global. Then use bcastSF to fill Ca, Cj.
418076ba34aSJunchao Zhang     */
419076ba34aSJunchao Zhang     ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
420076ba34aSJunchao Zhang     MatColIdxKokkosView          rows("rows",ioffset[niranks]);
421076ba34aSJunchao Zhang     Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */
422076ba34aSJunchao Zhang 
423076ba34aSJunchao Zhang     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
424076ba34aSJunchao Zhang 
425076ba34aSJunchao Zhang     MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
426076ba34aSJunchao Zhang     abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */
427076ba34aSJunchao Zhang 
428076ba34aSJunchao Zhang     const auto& Ba = bkok->a_dual.view_device();
429076ba34aSJunchao Zhang     const auto& Bi = bkok->i_dual.view_device();
430076ba34aSJunchao Zhang     const auto& Bj = bkok->j_dual.view_device();
431076ba34aSJunchao Zhang 
432076ba34aSJunchao Zhang     /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
433076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
434076ba34aSJunchao Zhang       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
435076ba34aSJunchao Zhang       PetscInt r    = rows(i);
436076ba34aSJunchao Zhang       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
437076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
438076ba34aSJunchao Zhang         abuf(base+k) = Ba(Bi(r)+k);
439076ba34aSJunchao Zhang         jbuf(base+k) = l2g(Bj(Bi(r)+k));
440076ba34aSJunchao Zhang       });
441076ba34aSJunchao Zhang     });
442076ba34aSJunchao Zhang 
443076ba34aSJunchao Zhang     /* Send abuf & jbuf to fill Ca, Cj */
444076ba34aSJunchao Zhang     ierr = PetscSFBcastBegin(bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
445076ba34aSJunchao Zhang     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
446076ba34aSJunchao Zhang     ierr = PetscSFBcastEnd  (bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
447076ba34aSJunchao Zhang     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
448076ba34aSJunchao Zhang     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
449076ba34aSJunchao Zhang     Cj.sync_host();
450076ba34aSJunchao Zhang     Ca.modify_device();
451076ba34aSJunchao Zhang 
452076ba34aSJunchao Zhang     /* Construct C with Ca, Ci, Cj */
453076ba34aSJunchao Zhang     auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
454076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr);
455076ba34aSJunchao Zhang     ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     ierr = PetscFree(Browlens);CHKERRQ(ierr);
457076ba34aSJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d\n",reuse);
458076ba34aSJunchao Zhang   PetscFunctionReturn(0);
459076ba34aSJunchao Zhang }
460076ba34aSJunchao Zhang 
461076ba34aSJunchao Zhang /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
462076ba34aSJunchao Zhang 
463076ba34aSJunchao Zhang   It is the reverse of MatSeqAIJKokkosBcast in some sense.
464076ba34aSJunchao Zhang 
465076ba34aSJunchao Zhang   Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves.
466076ba34aSJunchao Zhang   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
467076ba34aSJunchao Zhang   contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
468076ba34aSJunchao Zhang 
469076ba34aSJunchao Zhang   Input Parameters:
470076ba34aSJunchao Zhang +  A        - the SEQAIJKOKKOS matrix to be reduced
471076ba34aSJunchao Zhang .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
472076ba34aSJunchao Zhang .  local    - true if A uses local col ids; false if A is already in global col ids.
473076ba34aSJunchao Zhang .  N        - if local, N is A's global col size
474076ba34aSJunchao Zhang .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
475076ba34aSJunchao Zhang -  ownerSF  - the SF specifies ownership (root) of rows in A
476076ba34aSJunchao Zhang 
477076ba34aSJunchao Zhang   Output Parameters:
478076ba34aSJunchao Zhang +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
479076ba34aSJunchao Zhang .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
480076ba34aSJunchao Zhang .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
481076ba34aSJunchao Zhang .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to
482076ba34aSJunchao Zhang                   unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset.
483076ba34aSJunchao Zhang -  C            - the matrix made up by rows sent to me from other ranks, using global col ids
484076ba34aSJunchao Zhang 
485076ba34aSJunchao Zhang    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
486076ba34aSJunchao Zhang  */
487076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
488076ba34aSJunchao Zhang                                             PetscSF& reduceSF,MatScalarKokkosView& abuf,
489076ba34aSJunchao Zhang                                             MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
490076ba34aSJunchao Zhang                                             KokkosCsrMatrix& C)
491076ba34aSJunchao Zhang {
492076ba34aSJunchao Zhang   PetscErrorCode         ierr;
493076ba34aSJunchao Zhang   PetscInt               i,r,Am,An,Annz,Cnnz,nrows;
494076ba34aSJunchao Zhang   const PetscInt         *Ai;
495076ba34aSJunchao Zhang   Mat_SeqAIJKokkos       *akok;
496076ba34aSJunchao Zhang 
497076ba34aSJunchao Zhang   PetscFunctionBegin;
498076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */
499076ba34aSJunchao Zhang   ierr = MatGetSize(A,&Am,&An);
500076ba34aSJunchao Zhang   Ai   = static_cast<Mat_SeqAIJ*>(A->data)->i;
501076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
502076ba34aSJunchao Zhang   Annz = Ai[Am];
503076ba34aSJunchao Zhang 
504076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
505076ba34aSJunchao Zhang     /* Send Aa to abuf */
506076ba34aSJunchao Zhang     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
507076ba34aSJunchao Zhang     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
508076ba34aSJunchao Zhang 
509076ba34aSJunchao Zhang     /* Copy abuf to Ca */
510076ba34aSJunchao Zhang     const MatScalarKokkosView& Ca = C.values;
511076ba34aSJunchao Zhang     nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
512076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
513076ba34aSJunchao Zhang       PetscInt i   = t.league_rank();
514076ba34aSJunchao Zhang       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
515076ba34aSJunchao Zhang       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
516076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
517076ba34aSJunchao Zhang     });
518076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
519076ba34aSJunchao Zhang     MPI_Comm               comm;
520076ba34aSJunchao Zhang     MPI_Request            *reqs;
521076ba34aSJunchao Zhang     PetscMPIInt            tag;
522076ba34aSJunchao Zhang     PetscInt               Cm;
523076ba34aSJunchao Zhang 
524076ba34aSJunchao Zhang     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr);
525076ba34aSJunchao Zhang     ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr);
526076ba34aSJunchao Zhang 
527076ba34aSJunchao Zhang     PetscInt niranks,nranks,nroots,nleaves;
528076ba34aSJunchao Zhang     const PetscMPIInt *iranks,*ranks;
529076ba34aSJunchao Zhang     const PetscInt *ioffset,*rows,*roffset;  /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
530076ba34aSJunchao Zhang     ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr);
531076ba34aSJunchao Zhang     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */
532076ba34aSJunchao Zhang     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */
533076ba34aSJunchao Zhang     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
534*c0aa6a63SJacob Faibussowitsch     if (nleaves != Am) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")\n",nleaves,Am);
535076ba34aSJunchao Zhang     Cm    = nroots;
536076ba34aSJunchao Zhang     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
537076ba34aSJunchao Zhang 
538076ba34aSJunchao Zhang     /* Tell owners how long each row I will send */
539076ba34aSJunchao Zhang     PetscInt                *srowlens; /* send buf of row lens */
540076ba34aSJunchao Zhang     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
541076ba34aSJunchao Zhang     PetscInt                *rrowlens = rrowlens_h.data();
542076ba34aSJunchao Zhang 
543076ba34aSJunchao Zhang     ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr);
544076ba34aSJunchao Zhang     for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
545076ba34aSJunchao Zhang     rrowlens[0] = 0;
546076ba34aSJunchao Zhang     rrowlens++; /* shift the pointer to make the following expression more readable */
547076ba34aSJunchao Zhang     for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
548076ba34aSJunchao Zhang     for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);}
549076ba34aSJunchao Zhang     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
550076ba34aSJunchao Zhang 
551076ba34aSJunchao Zhang     /* Owner builds Ci on host by histogramming rrowlens[] */
552076ba34aSJunchao Zhang     MatRowMapKokkosViewHost Ci_h("i",Cm+1);
553076ba34aSJunchao Zhang     Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
554076ba34aSJunchao Zhang     MatRowMapType *Ci_ptr = Ci_h.data();
555076ba34aSJunchao Zhang 
556076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) {
557076ba34aSJunchao Zhang       r = rows[i]; /* local row id of i-th received row */
558076ba34aSJunchao Zhang      #if defined(PETSC_USE_DEBUG)
559*c0aa6a63SJacob Faibussowitsch       if (r<0 || r>=Cm) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")\n",r,Cm);
560076ba34aSJunchao Zhang      #endif
561076ba34aSJunchao Zhang       Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
562076ba34aSJunchao Zhang     }
563076ba34aSJunchao Zhang     for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
564076ba34aSJunchao Zhang     Cnnz = Ci_ptr[Cm];
565076ba34aSJunchao Zhang 
566076ba34aSJunchao Zhang     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
567076ba34aSJunchao Zhang     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
568076ba34aSJunchao Zhang     PetscInt                *dstrowoffset_hptr = dstrowoffset_h.data();
569076ba34aSJunchao Zhang     PetscInt                *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
570076ba34aSJunchao Zhang 
571076ba34aSJunchao Zhang     ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */
572076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) { /* for each row I receive */
573076ba34aSJunchao Zhang       r                    = rows[i]; /* row id in C */
574076ba34aSJunchao Zhang       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
575076ba34aSJunchao Zhang       currowlens[r]       += rrowlens[i]; /* accumulate to length of row r in C */
576076ba34aSJunchao Zhang     }
577076ba34aSJunchao Zhang     ierr = PetscFree(currowlens);CHKERRQ(ierr);
578076ba34aSJunchao Zhang 
579076ba34aSJunchao Zhang     rrowlens--;
580076ba34aSJunchao Zhang     for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
581076ba34aSJunchao Zhang     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
582076ba34aSJunchao Zhang     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
583076ba34aSJunchao Zhang 
584076ba34aSJunchao Zhang     /* Build the reduceSF, which performs buffer to buffer send/recv */
585076ba34aSJunchao Zhang     PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
586076ba34aSJunchao Zhang     ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr);
587076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
588076ba34aSJunchao Zhang     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
589076ba34aSJunchao Zhang     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
590076ba34aSJunchao Zhang     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
591076ba34aSJunchao Zhang 
592076ba34aSJunchao Zhang     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
593076ba34aSJunchao Zhang     PetscInt    nroots2 = Cnnz,nleaves2 = Annz;
594076ba34aSJunchao Zhang     PetscSFNode *iremote;
595076ba34aSJunchao Zhang     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */
596076ba34aSJunchao Zhang     for (i=0; i<nranks; i++) {
597076ba34aSJunchao Zhang       PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
598076ba34aSJunchao Zhang       PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
599076ba34aSJunchao Zhang       PetscInt nz       = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
600076ba34aSJunchao Zhang       for (PetscInt k=0; k<nz; k++) {
601076ba34aSJunchao Zhang         iremote[leafbase+k].rank  = ranks[i];
602076ba34aSJunchao Zhang         iremote[leafbase+k].index = rootbase + k;
603076ba34aSJunchao Zhang       }
604076ba34aSJunchao Zhang     }
605076ba34aSJunchao Zhang     ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr);
606076ba34aSJunchao Zhang     ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
607076ba34aSJunchao Zhang     ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr);
608076ba34aSJunchao Zhang 
609076ba34aSJunchao Zhang     /* Reduce Aa, Ajg to abuf and jbuf */
610076ba34aSJunchao Zhang 
611076ba34aSJunchao Zhang     /* If A uses local col ids, convert them to global ones before sending */
612076ba34aSJunchao Zhang     MatColIdxKokkosView Ajg;
613076ba34aSJunchao Zhang     if (local) {
614076ba34aSJunchao Zhang       Ajg = MatColIdxKokkosView("j",Annz);
615076ba34aSJunchao Zhang       const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
616076ba34aSJunchao Zhang       Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
617076ba34aSJunchao Zhang     } else {
618076ba34aSJunchao Zhang       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
619076ba34aSJunchao Zhang     }
620076ba34aSJunchao Zhang 
621076ba34aSJunchao Zhang     MatColIdxKokkosView   jbuf("jbuf",Cnnz);
622076ba34aSJunchao Zhang     abuf = MatScalarKokkosView("abuf",Cnnz);
623076ba34aSJunchao Zhang     ierr = PetscSFReduceBegin(reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
624076ba34aSJunchao Zhang     ierr = PetscSFReduceEnd  (reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
625076ba34aSJunchao Zhang     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
626076ba34aSJunchao Zhang     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
627076ba34aSJunchao Zhang 
628076ba34aSJunchao Zhang     /* Copy data from abuf, jbuf to Ca, Cj */
629076ba34aSJunchao Zhang     MatRowMapKokkosView    Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
630076ba34aSJunchao Zhang     MatColIdxKokkosView    Cj("j",Cnnz);
631076ba34aSJunchao Zhang     MatScalarKokkosView    Ca("a",Cnnz);
632076ba34aSJunchao Zhang 
633076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
634076ba34aSJunchao Zhang       PetscInt i   = t.league_rank();
635076ba34aSJunchao Zhang       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
636076ba34aSJunchao Zhang       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
637076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
638076ba34aSJunchao Zhang         Ca(dst+k) = abuf(src+k);
639076ba34aSJunchao Zhang         Cj(dst+k) = jbuf(src+k);
640076ba34aSJunchao Zhang       });
641076ba34aSJunchao Zhang     });
642076ba34aSJunchao Zhang 
643076ba34aSJunchao Zhang     /* Build C with Ca, Ci, Cj */
644076ba34aSJunchao Zhang     C    = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
645076ba34aSJunchao Zhang     ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr);
646076ba34aSJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d\n",reuse);
647076ba34aSJunchao Zhang   PetscFunctionReturn(0);
648076ba34aSJunchao Zhang }
649076ba34aSJunchao Zhang 
650076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix
651076ba34aSJunchao Zhang 
652076ba34aSJunchao Zhang   Input Parameters:
653076ba34aSJunchao Zhang +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
654076ba34aSJunchao Zhang .  reuse    - indicate whether the matrix has called this function before
655076ba34aSJunchao Zhang .  csrmat   - the KokkosCsrMatrix, of size m,N
656076ba34aSJunchao Zhang -  Cdstart  - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first
657076ba34aSJunchao Zhang               entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
658076ba34aSJunchao Zhang               entry is 5, then Cdstart[i] = 3.
659076ba34aSJunchao Zhang 
660076ba34aSJunchao Zhang   Output Parameters:
661076ba34aSJunchao Zhang +  C        - the updated MATMPIAIJKOKKOS matrix
662076ba34aSJunchao Zhang -  Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
663076ba34aSJunchao Zhang 
664076ba34aSJunchao Zhang   Notes:
665076ba34aSJunchao Zhang    Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
666076ba34aSJunchao Zhang  */
667076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
668076ba34aSJunchao Zhang {
669076ba34aSJunchao Zhang   PetscErrorCode                  ierr;
670076ba34aSJunchao Zhang   const MatScalarKokkosView&      Ca = csrmat.values;
671076ba34aSJunchao Zhang   const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
672076ba34aSJunchao Zhang   PetscInt                        m,n,N;
673076ba34aSJunchao Zhang 
674076ba34aSJunchao Zhang   PetscFunctionBegin;
675076ba34aSJunchao Zhang   ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr);
676076ba34aSJunchao Zhang   ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr);
677076ba34aSJunchao Zhang 
678076ba34aSJunchao Zhang   if (reuse == MAT_REUSE_MATRIX) {
679076ba34aSJunchao Zhang     Mat_MPIAIJ                  *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
680076ba34aSJunchao Zhang     Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
681076ba34aSJunchao Zhang     Mat_SeqAIJKokkos            *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
682076ba34aSJunchao Zhang     const MatScalarKokkosView&  Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
683076ba34aSJunchao Zhang     const MatRowMapKokkosView&  Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();
684076ba34aSJunchao Zhang 
685076ba34aSJunchao Zhang     /* Fill 'a' of Cd and Co on device */
686076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
687076ba34aSJunchao Zhang       PetscInt i       = t.league_rank(); /* row i */
688076ba34aSJunchao Zhang       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
689076ba34aSJunchao Zhang       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
690076ba34aSJunchao Zhang       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
691076ba34aSJunchao Zhang       PetscInt cdend   = cdstart + cdlen;
692076ba34aSJunchao Zhang       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
693076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
694076ba34aSJunchao Zhang         if (k < cdstart) {  /* k in [0, cdstart) */
695076ba34aSJunchao Zhang           Coa(Coi(i)+k) = Ca(Ci(i)+k);
696076ba34aSJunchao Zhang         } else if (k < cdend) { /* k in [cdstart, cdend) */
697076ba34aSJunchao Zhang           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
698076ba34aSJunchao Zhang         } else { /* k in [cdend, clen) */
699076ba34aSJunchao Zhang           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
700076ba34aSJunchao Zhang         }
701076ba34aSJunchao Zhang       });
702076ba34aSJunchao Zhang     });
703076ba34aSJunchao Zhang 
704076ba34aSJunchao Zhang     akok->a_dual.modify_device();
705076ba34aSJunchao Zhang     bkok->a_dual.modify_device();
706076ba34aSJunchao Zhang   } else if (reuse == MAT_INITIAL_MATRIX) {
707076ba34aSJunchao Zhang     Mat                         Cd,Co;
708076ba34aSJunchao Zhang     const MatColIdxKokkosView&  Cj = csrmat.graph.entries;
709076ba34aSJunchao Zhang     MatRowMapKokkosDualView     Cdi_dual("i",m+1),Coi_dual("i",m+1);
710076ba34aSJunchao Zhang     MatRowMapKokkosView         Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
711076ba34aSJunchao Zhang     PetscInt                    cstart,cend;
712076ba34aSJunchao Zhang 
713076ba34aSJunchao Zhang     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
714076ba34aSJunchao Zhang        left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend).
715076ba34aSJunchao Zhang        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
716076ba34aSJunchao Zhang        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly
717076ba34aSJunchao Zhang        stores values of cdstart and cdend-cstart (aka Cdi[]) instead.
718076ba34aSJunchao Zhang      */
719076ba34aSJunchao Zhang     Cdstart = MatRowMapKokkosView("Cdstart",m);
720076ba34aSJunchao Zhang     ierr    = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
721076ba34aSJunchao Zhang 
722076ba34aSJunchao Zhang     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
723076ba34aSJunchao Zhang       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
724076ba34aSJunchao Zhang      */
725076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
726076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
727076ba34aSJunchao Zhang         PetscInt i = t.league_rank(); /* row i */
728076ba34aSJunchao Zhang         PetscInt j,first,count,step;
729076ba34aSJunchao Zhang 
730076ba34aSJunchao Zhang         if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
731076ba34aSJunchao Zhang           Cdi(0) = 0;
732076ba34aSJunchao Zhang           Coi(0) = 0;
733076ba34aSJunchao Zhang         }
734076ba34aSJunchao Zhang 
735076ba34aSJunchao Zhang         /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
736076ba34aSJunchao Zhang           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
737076ba34aSJunchao Zhang         */
738076ba34aSJunchao Zhang         count = Ci(i+1)-Ci(i);
739076ba34aSJunchao Zhang         first = Ci(i);
740076ba34aSJunchao Zhang         while (count > 0) {
741076ba34aSJunchao Zhang           j    = first;
742076ba34aSJunchao Zhang           step = count / 2;
743076ba34aSJunchao Zhang           j   += step;
744076ba34aSJunchao Zhang           if (Cj(j) < cstart) {
745076ba34aSJunchao Zhang             first  = ++j;
746076ba34aSJunchao Zhang             count -= step + 1;
747076ba34aSJunchao Zhang           } else count = step;
748076ba34aSJunchao Zhang         }
749076ba34aSJunchao Zhang         Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
750076ba34aSJunchao Zhang 
751076ba34aSJunchao Zhang         /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
752076ba34aSJunchao Zhang         count = Ci(i+1) - first;
753076ba34aSJunchao Zhang         while (count > 0) {
754076ba34aSJunchao Zhang           j    = first;
755076ba34aSJunchao Zhang           step = count / 2;
756076ba34aSJunchao Zhang           j   += step;
757076ba34aSJunchao Zhang           if (Cj(j) < cend) {
758076ba34aSJunchao Zhang             first  = ++j;
759076ba34aSJunchao Zhang             count -= step + 1;
760076ba34aSJunchao Zhang           } else count = step;
761076ba34aSJunchao Zhang         }
762076ba34aSJunchao Zhang         Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
763076ba34aSJunchao Zhang         Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
764076ba34aSJunchao Zhang       });
765076ba34aSJunchao Zhang     });
766076ba34aSJunchao Zhang 
767076ba34aSJunchao Zhang     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
768076ba34aSJunchao Zhang     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
769076ba34aSJunchao Zhang       update += Cdi(i);
770076ba34aSJunchao Zhang       if (final) Cdi(i) = update;
771076ba34aSJunchao Zhang     });
772076ba34aSJunchao Zhang     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
773076ba34aSJunchao Zhang       update += Coi(i);
774076ba34aSJunchao Zhang       if (final) Coi(i) = update;
775076ba34aSJunchao Zhang     });
776076ba34aSJunchao Zhang 
777076ba34aSJunchao Zhang     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
778076ba34aSJunchao Zhang        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
779076ba34aSJunchao Zhang     */
780076ba34aSJunchao Zhang     Cdi_dual.modify_device();
781076ba34aSJunchao Zhang     Coi_dual.modify_device();
782076ba34aSJunchao Zhang     Cdi_dual.sync_host();
783076ba34aSJunchao Zhang     Coi_dual.sync_host();
784076ba34aSJunchao Zhang     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
785076ba34aSJunchao Zhang     PetscInt Co_nnz = Coi_dual.view_host().data()[m];
786076ba34aSJunchao Zhang 
787076ba34aSJunchao Zhang     /* With nnz, allocate a, j for Cd and Co */
788076ba34aSJunchao Zhang     MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
789076ba34aSJunchao Zhang     MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);
790076ba34aSJunchao Zhang 
791076ba34aSJunchao Zhang     /* Fill a, j of Cd and Co on device */
792076ba34aSJunchao Zhang     MatColIdxKokkosView     Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
793076ba34aSJunchao Zhang     MatScalarKokkosView     Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();
794076ba34aSJunchao Zhang 
795076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
796076ba34aSJunchao Zhang       PetscInt i       = t.league_rank(); /* row i */
797076ba34aSJunchao Zhang       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
798076ba34aSJunchao Zhang       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
799076ba34aSJunchao Zhang       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
800076ba34aSJunchao Zhang       PetscInt cdend   = cdstart + cdlen;
801076ba34aSJunchao Zhang       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
802076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
803076ba34aSJunchao Zhang         if (k < cdstart) { /* k in [0, cdstart) */
804076ba34aSJunchao Zhang           Coa(Coi(i)+k) = Ca(Ci(i)+k);
805076ba34aSJunchao Zhang           Coj(Coi(i)+k) = Cj(Ci(i)+k);
806076ba34aSJunchao Zhang         } else if (k < cdend) { /* k in [cdstart, cdend) */
807076ba34aSJunchao Zhang           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
808076ba34aSJunchao Zhang           Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
809076ba34aSJunchao Zhang         } else { /* k in [cdend, clen) */
810076ba34aSJunchao Zhang           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
811076ba34aSJunchao Zhang           Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
812076ba34aSJunchao Zhang         }
813076ba34aSJunchao Zhang       });
814076ba34aSJunchao Zhang     });
815076ba34aSJunchao Zhang 
816076ba34aSJunchao Zhang     Cdj_dual.modify_device();
817076ba34aSJunchao Zhang     Cda_dual.modify_device();
818076ba34aSJunchao Zhang     Coj_dual.modify_device();
819076ba34aSJunchao Zhang     Coa_dual.modify_device();
820076ba34aSJunchao Zhang     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
821076ba34aSJunchao Zhang     auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
822076ba34aSJunchao Zhang     auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
823076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr);
824076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr);
825076ba34aSJunchao Zhang     ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */
826076ba34aSJunchao Zhang   }
827076ba34aSJunchao Zhang   PetscFunctionReturn(0);
828076ba34aSJunchao Zhang }
829076ba34aSJunchao Zhang 
830076ba34aSJunchao Zhang /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids.
831076ba34aSJunchao Zhang 
832076ba34aSJunchao Zhang   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
833076ba34aSJunchao Zhang 
834076ba34aSJunchao Zhang   Input Parameters:
835076ba34aSJunchao Zhang +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
836076ba34aSJunchao Zhang .  reuse    - indicate whether the matrix has called this function before
837076ba34aSJunchao Zhang .  csrmat   - the KokkosCsrMatrix, of size m,N
838076ba34aSJunchao Zhang -  Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first
839076ba34aSJunchao Zhang               entry of the diag block of C in csrmat's j array.
840076ba34aSJunchao Zhang 
841076ba34aSJunchao Zhang   Output Parameters:
842076ba34aSJunchao Zhang +  C        - the updated MATMPIAIJKOKKOS matrix
843076ba34aSJunchao Zhang -  Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
844076ba34aSJunchao Zhang 
845076ba34aSJunchao Zhang   Notes: the input matrix's col ids and col size will be changed.
846076ba34aSJunchao Zhang */
847076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
848076ba34aSJunchao Zhang {
849076ba34aSJunchao Zhang   PetscErrorCode         ierr;
850076ba34aSJunchao Zhang   Mat_SeqAIJKokkos       *ckok;
851076ba34aSJunchao Zhang   ISLocalToGlobalMapping l2gmap;
852076ba34aSJunchao Zhang   const PetscInt         *garray;
853076ba34aSJunchao Zhang   PetscInt               sz;
854076ba34aSJunchao Zhang 
855076ba34aSJunchao Zhang   PetscFunctionBegin;
856076ba34aSJunchao Zhang   /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */
857076ba34aSJunchao Zhang   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr);
858076ba34aSJunchao Zhang   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
859076ba34aSJunchao Zhang   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
860076ba34aSJunchao Zhang   ckok->j_dual.sync_device();
861076ba34aSJunchao Zhang   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
862076ba34aSJunchao Zhang 
863076ba34aSJunchao Zhang   /* Build l2g -- the local to global mapping of C's cols */
864076ba34aSJunchao Zhang   ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr);
865076ba34aSJunchao Zhang   ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr);
866*c0aa6a63SJacob Faibussowitsch   if (C->cmap->n != sz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")\n", C->cmap->n,sz);
867076ba34aSJunchao Zhang 
868076ba34aSJunchao Zhang   ConstMatColIdxKokkosViewHost tmp(garray,sz);
869076ba34aSJunchao Zhang   l2g = MatColIdxKokkosView("l2g",sz);
870076ba34aSJunchao Zhang   Kokkos::deep_copy(l2g,tmp);
871076ba34aSJunchao Zhang 
872076ba34aSJunchao Zhang   ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr);
873076ba34aSJunchao Zhang   ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr);
874076ba34aSJunchao Zhang   PetscFunctionReturn(0);
875076ba34aSJunchao Zhang }
876076ba34aSJunchao Zhang 
877076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
878076ba34aSJunchao Zhang 
879076ba34aSJunchao Zhang   Input Parameters:
880076ba34aSJunchao Zhang +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
881076ba34aSJunchao Zhang .  A        - an MPIAIJKOKKOS matrix
882076ba34aSJunchao Zhang .  B        - an MPIAIJKOKKOS matrix
883076ba34aSJunchao Zhang -  mm       - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations.
884076ba34aSJunchao Zhang 
885076ba34aSJunchao Zhang   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
886076ba34aSJunchao Zhang */
887076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
888076ba34aSJunchao Zhang {
889076ba34aSJunchao Zhang   PetscErrorCode              ierr;
890076ba34aSJunchao Zhang   Mat_MPIAIJ                  *a = static_cast<Mat_MPIAIJ*>(A->data);
891076ba34aSJunchao Zhang   Mat                         Ad = a->A,Ao = a->B; /* diag and offdiag of A */
892076ba34aSJunchao Zhang   IS                          glob = NULL;
893076ba34aSJunchao Zhang   const PetscInt              *garray;
894076ba34aSJunchao Zhang   PetscInt                    N = B->cmap->N,sz;
895076ba34aSJunchao Zhang   ConstMatColIdxKokkosView    l2g1; /* two temp maps mapping local col ids to global ones */
896076ba34aSJunchao Zhang   MatColIdxKokkosView         l2g2;
897076ba34aSJunchao Zhang   Mat                         C1,C2; /* intermediate matrices */
898076ba34aSJunchao Zhang 
899076ba34aSJunchao Zhang   PetscFunctionBegin;
900076ba34aSJunchao Zhang   /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
901076ba34aSJunchao Zhang   ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr);
902076ba34aSJunchao Zhang   ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr);
903076ba34aSJunchao Zhang   ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr);
904076ba34aSJunchao Zhang   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
905076ba34aSJunchao Zhang   C1->product->api_user = product->api_user;
906076ba34aSJunchao Zhang   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
907076ba34aSJunchao Zhang   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
908076ba34aSJunchao Zhang   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
909076ba34aSJunchao Zhang 
910076ba34aSJunchao Zhang   ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
911076ba34aSJunchao Zhang   ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
912076ba34aSJunchao Zhang   const auto& tmp  = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
913076ba34aSJunchao Zhang   l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
914076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global);
915076ba34aSJunchao Zhang 
916076ba34aSJunchao Zhang   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
917076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,
918076ba34aSJunchao Zhang                               mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr);
919076ba34aSJunchao Zhang 
920076ba34aSJunchao Zhang   /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */
921076ba34aSJunchao Zhang   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr);
922076ba34aSJunchao Zhang   ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr);
923076ba34aSJunchao Zhang   ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr);
924076ba34aSJunchao Zhang   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
925076ba34aSJunchao Zhang   C2->product->api_user = product->api_user;
926076ba34aSJunchao Zhang   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
927076ba34aSJunchao Zhang   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
928076ba34aSJunchao Zhang   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
929076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global);
930076ba34aSJunchao Zhang 
931076ba34aSJunchao Zhang   /* C = C1 + C2.  We actually use their global col ids versions in adding */
932076ba34aSJunchao Zhang   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
933076ba34aSJunchao Zhang   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
934076ba34aSJunchao Zhang   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
935076ba34aSJunchao Zhang   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
936076ba34aSJunchao Zhang 
937076ba34aSJunchao Zhang   mm->C1 = C1;
938076ba34aSJunchao Zhang   mm->C2 = C2;
939076ba34aSJunchao Zhang   ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
940076ba34aSJunchao Zhang   ierr = ISDestroy(&glob);CHKERRQ(ierr);
941076ba34aSJunchao Zhang   PetscFunctionReturn(0);
942076ba34aSJunchao Zhang }
943076ba34aSJunchao Zhang 
944076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
945076ba34aSJunchao Zhang 
946076ba34aSJunchao Zhang   Input Parameters:
947076ba34aSJunchao Zhang +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
948076ba34aSJunchao Zhang .  A        - an MPIAIJKOKKOS matrix
949076ba34aSJunchao Zhang .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
950076ba34aSJunchao Zhang .  localB   - Does B use local col ids? If false, then B is already in global col ids.
951076ba34aSJunchao Zhang .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
952076ba34aSJunchao Zhang .  l2g      - If localB, then l2g maps B's local col ids to global ones.
953076ba34aSJunchao Zhang -  mm       - a struct used to stash intermediate data in AtB
954076ba34aSJunchao Zhang 
955076ba34aSJunchao Zhang   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
956076ba34aSJunchao Zhang */
957076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
958076ba34aSJunchao Zhang {
959076ba34aSJunchao Zhang   PetscErrorCode         ierr;
960076ba34aSJunchao Zhang   Mat_MPIAIJ             *a = static_cast<Mat_MPIAIJ*>(A->data);
961076ba34aSJunchao Zhang   Mat                    Ad = a->A,Ao = a->B; /* diag and offdiag of A */
962076ba34aSJunchao Zhang   Mat                    C1,C2; /* intermediate matrices */
963076ba34aSJunchao Zhang 
964076ba34aSJunchao Zhang   PetscFunctionBegin;
965076ba34aSJunchao Zhang   /* C1 = Ad^t * B */
966076ba34aSJunchao Zhang   ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr);
967076ba34aSJunchao Zhang   ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr);
968076ba34aSJunchao Zhang   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
969076ba34aSJunchao Zhang   C1->product->api_user = product->api_user;
970076ba34aSJunchao Zhang   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
971076ba34aSJunchao Zhang   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
972076ba34aSJunchao Zhang   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
973076ba34aSJunchao Zhang 
974076ba34aSJunchao Zhang   if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);}
975076ba34aSJunchao Zhang   else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
976076ba34aSJunchao Zhang 
977076ba34aSJunchao Zhang   /* C2 = Ao^t * B */
978076ba34aSJunchao Zhang   ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr);
979076ba34aSJunchao Zhang   ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr);
980076ba34aSJunchao Zhang   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
981076ba34aSJunchao Zhang   C2->product->api_user = product->api_user;
982076ba34aSJunchao Zhang   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
983076ba34aSJunchao Zhang   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
984076ba34aSJunchao Zhang   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
985076ba34aSJunchao Zhang 
986076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,
987076ba34aSJunchao Zhang                                mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr);
988076ba34aSJunchao Zhang 
989076ba34aSJunchao Zhang   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
990076ba34aSJunchao Zhang   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
991076ba34aSJunchao Zhang   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
992076ba34aSJunchao Zhang   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
993076ba34aSJunchao Zhang   mm->C1 = C1;
994076ba34aSJunchao Zhang   mm->C2 = C2;
995076ba34aSJunchao Zhang   PetscFunctionReturn(0);
996076ba34aSJunchao Zhang }
997076ba34aSJunchao Zhang 
998076ba34aSJunchao Zhang PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
999076ba34aSJunchao Zhang {
1000076ba34aSJunchao Zhang   PetscErrorCode                ierr;
1001076ba34aSJunchao Zhang   Mat_Product                   *product = C->product;
1002076ba34aSJunchao Zhang   MatProductType                ptype;
1003076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos   *mmdata;
1004076ba34aSJunchao Zhang   MatMatStruct                  *mm = NULL;
1005076ba34aSJunchao Zhang   MatMatStruct_AB               *ab;
1006076ba34aSJunchao Zhang   MatMatStruct_AtB              *atb;
1007076ba34aSJunchao Zhang   Mat                           A,B,Ad,Ao,Bd,Bo;
1008076ba34aSJunchao Zhang   const MatScalarType           one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */
1009076ba34aSJunchao Zhang 
1010076ba34aSJunchao Zhang   PetscFunctionBegin;
1011076ba34aSJunchao Zhang   MatCheckProduct(C,1);
1012076ba34aSJunchao Zhang   mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
1013076ba34aSJunchao Zhang   ptype  = product->type;
1014076ba34aSJunchao Zhang   A      = product->A;
1015076ba34aSJunchao Zhang   B      = product->B;
1016076ba34aSJunchao Zhang   Ad     = static_cast<Mat_MPIAIJ*>(A->data)->A;
1017076ba34aSJunchao Zhang   Ao     = static_cast<Mat_MPIAIJ*>(A->data)->B;
1018076ba34aSJunchao Zhang   Bd     = static_cast<Mat_MPIAIJ*>(B->data)->A;
1019076ba34aSJunchao Zhang   Bo     = static_cast<Mat_MPIAIJ*>(B->data)->B;
1020076ba34aSJunchao Zhang 
1021076ba34aSJunchao Zhang   if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1022076ba34aSJunchao Zhang     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
1023076ba34aSJunchao Zhang     ab  = mmdata->mmAB;
1024076ba34aSJunchao Zhang     atb = mmdata->mmAtB;
1025076ba34aSJunchao Zhang     if (ab) {
1026076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1027076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1028076ba34aSJunchao Zhang     }
1029076ba34aSJunchao Zhang     if (atb) {
1030076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1031076ba34aSJunchao Zhang       static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1032076ba34aSJunchao Zhang     }
1033076ba34aSJunchao Zhang     PetscFunctionReturn(0);
1034076ba34aSJunchao Zhang   }
1035076ba34aSJunchao Zhang 
1036076ba34aSJunchao Zhang   if (ptype == MATPRODUCT_AB) {
1037076ba34aSJunchao Zhang     ab   = mmdata->mmAB;
1038076ba34aSJunchao Zhang     /* C1 = Ad * B_local */
1039076ba34aSJunchao Zhang     if (!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB");
1040076ba34aSJunchao Zhang     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1041076ba34aSJunchao Zhang     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed");
1042076ba34aSJunchao Zhang     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1043076ba34aSJunchao Zhang     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1044076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1045076ba34aSJunchao Zhang                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1046076ba34aSJunchao Zhang     /* C2 = Ao * B_other */
1047076ba34aSJunchao Zhang     if (ab->C2->product->B != ab->B_other) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed");
1048076ba34aSJunchao Zhang     if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1049076ba34aSJunchao Zhang     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr);
1050076ba34aSJunchao Zhang     /* C = C1_global + C2_global */
1051076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1052076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(ab);
1053076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_AtB) {
1054076ba34aSJunchao Zhang     atb  = mmdata->mmAtB;
1055076ba34aSJunchao Zhang     if (!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB");
1056076ba34aSJunchao Zhang     /* C1 = Ad^t * B_local */
1057076ba34aSJunchao Zhang     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr);
1058076ba34aSJunchao Zhang     if (atb->C1->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed");
1059076ba34aSJunchao Zhang     if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1060076ba34aSJunchao Zhang     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1061076ba34aSJunchao Zhang 
1062076ba34aSJunchao Zhang     /* C2 = Ao^t * B_local */
1063076ba34aSJunchao Zhang     if (atb->C2->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed");
1064076ba34aSJunchao Zhang     if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1065076ba34aSJunchao Zhang     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1066076ba34aSJunchao Zhang     /* Form C2_global */
1067076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1068076ba34aSJunchao Zhang                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1069076ba34aSJunchao Zhang     /* C = C1_global + C2_global */
1070076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1071076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(atb);
1072076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1073076ba34aSJunchao Zhang     ab   = mmdata->mmAB;
1074076ba34aSJunchao Zhang     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1075076ba34aSJunchao Zhang 
1076076ba34aSJunchao Zhang     /* ab->C1 = Ad * B_local */
1077076ba34aSJunchao Zhang     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed");
1078076ba34aSJunchao Zhang     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1079076ba34aSJunchao Zhang     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1080076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1081076ba34aSJunchao Zhang                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1082076ba34aSJunchao Zhang     /* ab->C2 = Ao * B_other */
1083076ba34aSJunchao Zhang     if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1084076ba34aSJunchao Zhang     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */
1085076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1086076ba34aSJunchao Zhang 
1087076ba34aSJunchao Zhang     /* atb->C1 = Bd^t * ab->C_petsc */
1088076ba34aSJunchao Zhang     atb  = mmdata->mmAtB;
1089076ba34aSJunchao Zhang     if (atb->C1->product->B != ab->C_petsc) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed");
1090076ba34aSJunchao Zhang     if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1091076ba34aSJunchao Zhang     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1092076ba34aSJunchao Zhang     /* atb->C2 = Bo^t * ab->C_petsc */
1093076ba34aSJunchao Zhang     if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1094076ba34aSJunchao Zhang     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1095076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1096076ba34aSJunchao Zhang                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1097076ba34aSJunchao Zhang     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1098076ba34aSJunchao Zhang     mm = static_cast<MatMatStruct*>(atb);
1099076ba34aSJunchao Zhang   }
1100076ba34aSJunchao Zhang   /* Split C_global to form C */
1101076ba34aSJunchao Zhang   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1102076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1103076ba34aSJunchao Zhang }
1104076ba34aSJunchao Zhang 
1105076ba34aSJunchao Zhang PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1106076ba34aSJunchao Zhang {
1107076ba34aSJunchao Zhang   PetscErrorCode              ierr;
1108076ba34aSJunchao Zhang   Mat                         A,B;
1109076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
1110076ba34aSJunchao Zhang   MatProductType              ptype;
1111076ba34aSJunchao Zhang   MatProductData_MPIAIJKokkos *mmdata;
1112076ba34aSJunchao Zhang   MatMatStruct                *mm = NULL;
1113076ba34aSJunchao Zhang   IS                          glob = NULL;
1114076ba34aSJunchao Zhang   const PetscInt              *garray;
1115076ba34aSJunchao Zhang   PetscInt                    m,n,M,N,sz;
1116076ba34aSJunchao Zhang   ConstMatColIdxKokkosView    l2g; /* map local col ids to global ones */
1117076ba34aSJunchao Zhang 
1118076ba34aSJunchao Zhang   PetscFunctionBegin;
1119076ba34aSJunchao Zhang   MatCheckProduct(C,1);
1120076ba34aSJunchao Zhang   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1121076ba34aSJunchao Zhang   ptype = product->type;
1122076ba34aSJunchao Zhang   A     = product->A;
1123076ba34aSJunchao Zhang   B     = product->B;
1124076ba34aSJunchao Zhang 
1125076ba34aSJunchao Zhang   switch (ptype) {
1126076ba34aSJunchao Zhang     case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1127076ba34aSJunchao Zhang     case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1128076ba34aSJunchao Zhang     case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
1129076ba34aSJunchao Zhang     default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1130076ba34aSJunchao Zhang   }
1131076ba34aSJunchao Zhang 
1132076ba34aSJunchao Zhang   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
1133076ba34aSJunchao Zhang   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
1134076ba34aSJunchao Zhang   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
1135076ba34aSJunchao Zhang   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1136076ba34aSJunchao Zhang 
1137076ba34aSJunchao Zhang   mmdata           = new MatProductData_MPIAIJKokkos();
1138076ba34aSJunchao Zhang   mmdata->reusesym = product->api_user;
1139076ba34aSJunchao Zhang 
1140076ba34aSJunchao Zhang   if (ptype == MATPRODUCT_AB) {
1141076ba34aSJunchao Zhang     mmdata->mmAB = new MatMatStruct_AB();
1142076ba34aSJunchao Zhang     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr);
1143076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(mmdata->mmAB);
1144076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_AtB) {
1145076ba34aSJunchao Zhang     mmdata->mmAtB = new MatMatStruct_AtB();
1146076ba34aSJunchao Zhang     auto atb      = mmdata->mmAtB;
1147076ba34aSJunchao Zhang     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr);
1148076ba34aSJunchao Zhang     ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
1149076ba34aSJunchao Zhang     ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
1150076ba34aSJunchao Zhang     l2g  = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
1151076ba34aSJunchao Zhang     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr);
1152076ba34aSJunchao Zhang     ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
1153076ba34aSJunchao Zhang     ierr = ISDestroy(&glob);CHKERRQ(ierr);
1154076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(atb);
1155076ba34aSJunchao Zhang   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1156076ba34aSJunchao Zhang     mmdata->mmAB  = new MatMatStruct_AB(); /* tmp=A*B */
1157076ba34aSJunchao Zhang     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1158076ba34aSJunchao Zhang     auto ab       = mmdata->mmAB;
1159076ba34aSJunchao Zhang     auto atb      = mmdata->mmAtB;
1160076ba34aSJunchao Zhang     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr);
1161076ba34aSJunchao Zhang     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1162076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr);
1163076ba34aSJunchao Zhang     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr);
1164076ba34aSJunchao Zhang     mm   = static_cast<MatMatStruct*>(atb);
1165076ba34aSJunchao Zhang   }
1166076ba34aSJunchao Zhang   /* Split the C_global into petsc A, B format */
1167076ba34aSJunchao Zhang   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1168076ba34aSJunchao Zhang   C->product->data        = mmdata;
1169076ba34aSJunchao Zhang   C->product->destroy     = MatProductDataDestroy_MPIAIJKokkos;
1170076ba34aSJunchao Zhang   C->ops->productnumeric  = MatProductNumeric_MPIAIJKokkos;
1171076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1172076ba34aSJunchao Zhang }
1173076ba34aSJunchao Zhang 
1174076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1175076ba34aSJunchao Zhang {
1176076ba34aSJunchao Zhang   PetscErrorCode ierr;
1177076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
1178076ba34aSJunchao Zhang   PetscBool      match = PETSC_FALSE;
1179076ba34aSJunchao Zhang   PetscBool      usecpu = PETSC_FALSE;
1180076ba34aSJunchao Zhang 
1181076ba34aSJunchao Zhang   PetscFunctionBegin;
1182076ba34aSJunchao Zhang   MatCheckProduct(mat,1);
1183076ba34aSJunchao Zhang   if (!product->A->boundtocpu && !product->B->boundtocpu) {
1184076ba34aSJunchao Zhang     ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr);
1185076ba34aSJunchao Zhang   }
1186076ba34aSJunchao Zhang   if (match) { /* we can always fallback to the CPU if requested */
1187076ba34aSJunchao Zhang     switch (product->type) {
1188076ba34aSJunchao Zhang     case MATPRODUCT_AB:
1189076ba34aSJunchao Zhang       if (product->api_user) {
1190076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1191076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1192076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1193076ba34aSJunchao Zhang       } else {
1194076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1195076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1196076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1197076ba34aSJunchao Zhang       }
1198076ba34aSJunchao Zhang       break;
1199076ba34aSJunchao Zhang     case MATPRODUCT_AtB:
1200076ba34aSJunchao Zhang       if (product->api_user) {
1201076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1202076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1203076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1204076ba34aSJunchao Zhang       } else {
1205076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1206076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1207076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1208076ba34aSJunchao Zhang       }
1209076ba34aSJunchao Zhang       break;
1210076ba34aSJunchao Zhang     case MATPRODUCT_PtAP:
1211076ba34aSJunchao Zhang       if (product->api_user) {
1212076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1213076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1214076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1215076ba34aSJunchao Zhang       } else {
1216076ba34aSJunchao Zhang         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
1217076ba34aSJunchao Zhang         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1218076ba34aSJunchao Zhang         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1219076ba34aSJunchao Zhang       }
1220076ba34aSJunchao Zhang       break;
1221076ba34aSJunchao Zhang     default:
1222076ba34aSJunchao Zhang       break;
1223076ba34aSJunchao Zhang     }
1224076ba34aSJunchao Zhang     match = (PetscBool)!usecpu;
1225076ba34aSJunchao Zhang   }
1226076ba34aSJunchao Zhang   if (match) {
1227076ba34aSJunchao Zhang     switch (product->type) {
1228076ba34aSJunchao Zhang     case MATPRODUCT_AB:
1229076ba34aSJunchao Zhang     case MATPRODUCT_AtB:
1230076ba34aSJunchao Zhang     case MATPRODUCT_PtAP:
1231076ba34aSJunchao Zhang       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1232076ba34aSJunchao Zhang       break;
1233076ba34aSJunchao Zhang     default:
1234076ba34aSJunchao Zhang       break;
1235076ba34aSJunchao Zhang     }
1236076ba34aSJunchao Zhang   }
1237076ba34aSJunchao Zhang   /* fallback to MPIAIJ ops */
1238076ba34aSJunchao Zhang   if (!mat->ops->productsymbolic) {
1239076ba34aSJunchao Zhang     ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
1240076ba34aSJunchao Zhang   }
1241076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1242076ba34aSJunchao Zhang }
1243076ba34aSJunchao Zhang 
1244076ba34aSJunchao Zhang PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1245076ba34aSJunchao Zhang {
1246076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1247076ba34aSJunchao Zhang 
1248076ba34aSJunchao Zhang   PetscFunctionBegin;
1249076ba34aSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
1250076ba34aSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
1251076ba34aSJunchao Zhang   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
1252076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1253076ba34aSJunchao Zhang }
1254076ba34aSJunchao Zhang 
12558c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
12568c3ff71bSJunchao Zhang {
12578c3ff71bSJunchao Zhang   PetscErrorCode     ierr;
12588c3ff71bSJunchao Zhang   Mat                B;
1259076ba34aSJunchao Zhang   Mat_MPIAIJ         *a;
12608c3ff71bSJunchao Zhang 
12618c3ff71bSJunchao Zhang   PetscFunctionBegin;
12628c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
12638c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
12648c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
12658c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
12668c3ff71bSJunchao Zhang   }
12678c3ff71bSJunchao Zhang   B = *newmat;
12688c3ff71bSJunchao Zhang 
12696f3d89d0SStefano Zampini   B->boundtocpu = PETSC_FALSE;
12708c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
12718c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
12723d0639e7SStefano Zampini   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
12738c3ff71bSJunchao Zhang 
1274076ba34aSJunchao Zhang   a = static_cast<Mat_MPIAIJ*>(A->data);
1275076ba34aSJunchao Zhang   if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1276076ba34aSJunchao Zhang   if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1277076ba34aSJunchao Zhang   if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);}
1278076ba34aSJunchao Zhang 
12798c3ff71bSJunchao Zhang   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
12808c3ff71bSJunchao Zhang   B->ops->mult                  = MatMult_MPIAIJKokkos;
12818c3ff71bSJunchao Zhang   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
12828c3ff71bSJunchao Zhang   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1283076ba34aSJunchao Zhang   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1284076ba34aSJunchao Zhang   B->ops->destroy               = MatDestroy_MPIAIJKokkos;
12858c3ff71bSJunchao Zhang 
12863d0639e7SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1287076ba34aSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr);
12888c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12898c3ff71bSJunchao Zhang }
12908c3ff71bSJunchao Zhang 
12918c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
12928c3ff71bSJunchao Zhang {
12938c3ff71bSJunchao Zhang   PetscErrorCode ierr;
12948c3ff71bSJunchao Zhang 
12958c3ff71bSJunchao Zhang   PetscFunctionBegin;
12968c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
12978c3ff71bSJunchao Zhang   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
12988c3ff71bSJunchao Zhang   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
12998c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13008c3ff71bSJunchao Zhang }
13018c3ff71bSJunchao Zhang 
13028c3ff71bSJunchao Zhang /*@C
13038c3ff71bSJunchao Zhang    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
13048c3ff71bSJunchao Zhang    (the default parallel PETSc format).  This matrix will ultimately pushed down
13058c3ff71bSJunchao Zhang    to Kokkos for calculations. For good matrix
13068c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
13078c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
13088c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
13098c3ff71bSJunchao Zhang 
13108c3ff71bSJunchao Zhang    Collective
13118c3ff71bSJunchao Zhang 
13128c3ff71bSJunchao Zhang    Input Parameters:
13138c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
13148c3ff71bSJunchao Zhang .  m - number of rows
13158c3ff71bSJunchao Zhang .  n - number of columns
13168c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
13178c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
13188c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
13198c3ff71bSJunchao Zhang 
13208c3ff71bSJunchao Zhang    Output Parameter:
13218c3ff71bSJunchao Zhang .  A - the matrix
13228c3ff71bSJunchao Zhang 
13238c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
13248c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradigm instead of this routine directly.
13258c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
13268c3ff71bSJunchao Zhang 
13278c3ff71bSJunchao Zhang    Notes:
13288c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
13298c3ff71bSJunchao Zhang 
13308c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
13318c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
13328c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
13338c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
13348c3ff71bSJunchao Zhang 
13358c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
13368c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
13378c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
13388c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
13398c3ff71bSJunchao Zhang 
13408c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
13418c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
13428c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
13438c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
13448c3ff71bSJunchao Zhang 
13458c3ff71bSJunchao Zhang    Level: intermediate
13468c3ff71bSJunchao Zhang 
13478c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
13488c3ff71bSJunchao Zhang @*/
13498c3ff71bSJunchao Zhang PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
13508c3ff71bSJunchao Zhang {
13518c3ff71bSJunchao Zhang   PetscErrorCode ierr;
13528c3ff71bSJunchao Zhang   PetscMPIInt    size;
13538c3ff71bSJunchao Zhang 
13548c3ff71bSJunchao Zhang   PetscFunctionBegin;
13558c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
13568c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
1357ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
13588c3ff71bSJunchao Zhang   if (size > 1) {
13598c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
13608c3ff71bSJunchao Zhang     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
13618c3ff71bSJunchao Zhang   } else {
13628c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
13638c3ff71bSJunchao Zhang     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
13648c3ff71bSJunchao Zhang   }
13658c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13668c3ff71bSJunchao Zhang }
13678c3ff71bSJunchao Zhang 
1368a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
1369042217e8SBarry Smith PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1370a587d139SMark {
1371a587d139SMark   PetscMPIInt                size,rank;
1372a587d139SMark   MPI_Comm                   comm;
1373a587d139SMark   PetscErrorCode             ierr;
1374042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat=NULL;
1375a587d139SMark 
1376a587d139SMark   PetscFunctionBegin;
1377a587d139SMark   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
137855b25c41SPierre Jolivet   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
137955b25c41SPierre Jolivet   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
1380a587d139SMark   if (size == 1) {
1381a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
1382fc76dfabSMark Adams     ierr   = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */
1383a587d139SMark   } else {
1384a587d139SMark     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
1385a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
1386fc76dfabSMark Adams     ierr   = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr);
1387fc76dfabSMark Adams     ierr   = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr);
1388a587d139SMark   }
1389a587d139SMark   // act like MatSetValues because not called on host
1390a587d139SMark   if (A->assembled) {
1391a587d139SMark     if (A->was_assembled) {
1392a587d139SMark       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
1393a587d139SMark     }
1394a587d139SMark     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
1395a587d139SMark   } else {
1396*c0aa6a63SJacob Faibussowitsch     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr);
1397a587d139SMark     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
1398a587d139SMark   }
1399a587d139SMark   if (!d_mat) {
1400042217e8SBarry Smith     struct _n_SplitCSRMat h_mat; /* host container */
1401a587d139SMark     Mat_SeqAIJKokkos      *aijkokA;
1402a587d139SMark     Mat_SeqAIJ            *jaca;
1403a587d139SMark     PetscInt              n = A->rmap->n, nnz;
1404a587d139SMark     Mat                   Amat;
1405042217e8SBarry Smith     PetscInt              *colmap;
1406042217e8SBarry Smith 
1407042217e8SBarry Smith     /* create and copy h_mat */
140849b994a9SMark Adams     h_mat.M = A->cmap->N; // use for debug build
1409a587d139SMark     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
1410a587d139SMark     if (size == 1) {
1411a587d139SMark       Amat = A;
1412a587d139SMark       jaca = (Mat_SeqAIJ*)A->data;
1413a587d139SMark       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1414a587d139SMark       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1415a587d139SMark       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1416a587d139SMark       h_mat.offdiag.a = NULL;
1417a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1418a587d139SMark       aijkokA->i_uncompressed_d = NULL;
1419a587d139SMark       aijkokA->colmap_d = NULL;
1420a587d139SMark     } else {
1421a587d139SMark       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
1422a587d139SMark       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
1423a587d139SMark       PetscInt         ii;
1424a587d139SMark       Mat_SeqAIJKokkos *aijkokB;
1425042217e8SBarry Smith 
1426a587d139SMark       Amat = aij->A;
1427a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1428a587d139SMark       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1429a587d139SMark       aijkokA->i_uncompressed_d = NULL;
1430a587d139SMark       aijkokA->colmap_d = NULL;
1431a587d139SMark       jaca = (Mat_SeqAIJ*)aij->A->data;
1432b3c64f9dSJunchao Zhang       if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
1433a587d139SMark       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
1434a587d139SMark       aij->donotstash = PETSC_TRUE;
1435a587d139SMark       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1436a5b23f4aSJose E. Roman       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1437042217e8SBarry Smith       ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr);
1438042217e8SBarry Smith       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
1439042217e8SBarry Smith       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1440a587d139SMark       // allocate B copy data
1441a587d139SMark       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1442a587d139SMark       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1443a587d139SMark       nnz = jacb->i[n];
1444a587d139SMark       if (jacb->compressedrow.use) {
1445a587d139SMark         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
14469d676ae4SMark Adams         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1447a587d139SMark         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
1448a587d139SMark         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
1449a587d139SMark       } else {
145099551766SMark Adams          h_mat.offdiag.i = aijkokB->i_device_data();
1451a587d139SMark       }
145299551766SMark Adams       h_mat.offdiag.j = aijkokB->j_device_data();
1453076ba34aSJunchao Zhang       h_mat.offdiag.a = aijkokB->a_device_data();
1454a587d139SMark       {
1455042217e8SBarry Smith         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
14569d676ae4SMark Adams         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1457a587d139SMark         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
1458a587d139SMark         h_mat.colmap = aijkokB->colmap_d->data();
1459042217e8SBarry Smith         ierr = PetscFree(colmap);CHKERRQ(ierr);
1460a587d139SMark       }
1461a587d139SMark       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1462a587d139SMark       h_mat.offdiag.n = n;
1463a587d139SMark     }
1464a587d139SMark     // allocate A copy data
1465a587d139SMark     nnz = jaca->i[n];
1466a587d139SMark     h_mat.diag.n = n;
1467a587d139SMark     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
146855b25c41SPierre Jolivet     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
1469042217e8SBarry Smith     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
1470042217e8SBarry Smith     else {
147199551766SMark Adams       h_mat.diag.i = aijkokA->i_device_data();
1472a587d139SMark     }
147399551766SMark Adams     h_mat.diag.j = aijkokA->j_device_data();
1474076ba34aSJunchao Zhang     h_mat.diag.a = aijkokA->a_device_data();
1475a587d139SMark     // copy pointers and metdata to device
1476a587d139SMark     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
1477a587d139SMark     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
1478*c0aa6a63SJacob Faibussowitsch     ierr = PetscInfo2(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
1479a587d139SMark   }
1480a587d139SMark   *B = d_mat; // return it, set it in Mat, and set it up
1481a587d139SMark   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
1482a587d139SMark   PetscFunctionReturn(0);
1483a587d139SMark }
1484076ba34aSJunchao Zhang 
1485076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1486076ba34aSJunchao Zhang {
1487076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1488076ba34aSJunchao Zhang 
1489076ba34aSJunchao Zhang   PetscFunctionBegin;
1490076ba34aSJunchao Zhang   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1491076ba34aSJunchao Zhang   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1492076ba34aSJunchao Zhang   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1493076ba34aSJunchao Zhang   else *mask = "PETSC_OFFLOAD_BOTH";
1494076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1495076ba34aSJunchao Zhang }
1496076ba34aSJunchao Zhang 
1497076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1498076ba34aSJunchao Zhang {
1499076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1500076ba34aSJunchao Zhang   PetscMPIInt       size;
1501076ba34aSJunchao Zhang   Mat               Ad,Ao;
1502076ba34aSJunchao Zhang   const char        *amask,*bmask;
1503076ba34aSJunchao Zhang 
1504076ba34aSJunchao Zhang   PetscFunctionBegin;
1505076ba34aSJunchao Zhang   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1506076ba34aSJunchao Zhang 
1507076ba34aSJunchao Zhang   if (size == 1) {
1508076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr);
1509076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr);
1510076ba34aSJunchao Zhang   } else {
1511076ba34aSJunchao Zhang     Ad  = ((Mat_MPIAIJ*)A->data)->A;
1512076ba34aSJunchao Zhang     Ao  = ((Mat_MPIAIJ*)A->data)->B;
1513076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr);
1514076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr);
1515076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr);
1516076ba34aSJunchao Zhang   }
1517076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1518076ba34aSJunchao Zhang }
1519