xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 8ebe3e4e9e00d86ece2e9fcd0cc84910b0ad437c)
1 #include <petscvec_kokkos.hpp>
2 #include <petscsf.h>
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>
4 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
5 #include <KokkosSparse_spadd.hpp>
6 
7 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
8 {
9   PetscErrorCode   ierr;
10   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
11   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
12 
13   PetscFunctionBegin;
14   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
15   if (aijkok && aijkok->device_mat_d.data()) {
16     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
17   }
18 
19   PetscFunctionReturn(0);
20 }
21 
22 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
23 {
24   PetscErrorCode ierr;
25   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
26 
27   PetscFunctionBegin;
28   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
29   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
30 #if defined(PETSC_USE_DEBUG)
31   if (d_nnz) {
32     PetscInt i;
33     for (i=0; i<mat->rmap->n; i++) {
34       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
35     }
36   }
37   if (o_nnz) {
38     PetscInt i;
39     for (i=0; i<mat->rmap->n; i++) {
40       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
41     }
42   }
43 #endif
44 #if defined(PETSC_USE_CTABLE)
45   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
46 #else
47   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
48 #endif
49   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
50   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
51   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
52   /* Because the B will have been resized we simply destroy it and create a new one each time */
53   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
54 
55   if (!mpiaij->A) {
56     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
57     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
58     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
59   }
60   if (!mpiaij->B) {
61     PetscMPIInt size;
62     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
63     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
64     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
65     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
66   }
67   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
68   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
69   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
70   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
71   mat->preallocated = PETSC_TRUE;
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
76 {
77   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
78   PetscErrorCode ierr;
79   PetscInt       nt;
80 
81   PetscFunctionBegin;
82   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
83   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
84   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
85   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
86   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
87   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
88   PetscFunctionReturn(0);
89 }
90 
91 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
92 {
93   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
94   PetscErrorCode ierr;
95   PetscInt       nt;
96 
97   PetscFunctionBegin;
98   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
99   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
100   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
101   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
102   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
103   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
104   PetscFunctionReturn(0);
105 }
106 
107 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
108 {
109   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
110   PetscErrorCode ierr;
111   PetscInt       nt;
112 
113   PetscFunctionBegin;
114   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
115   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt);
116   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
117   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
118   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
119   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
120   PetscFunctionReturn(0);
121 }
122 
123 /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
124    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
125    C still uses local column ids. Their corresponding global column ids are returned in glob.
126 */
127 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
128 {
129   Mat            Ad,Ao;
130   const PetscInt *cmap;
131   PetscErrorCode ierr;
132 
133   PetscFunctionBegin;
134   ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr);
135   ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr);
136   if (glob) {
137     PetscInt cst, i, dn, on, *gidx;
138     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
139     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
140     ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr);
141     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
142     for (i=0; i<dn; i++) gidx[i]    = cst + i;
143     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
144     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
145   }
146   PetscFunctionReturn(0);
147 }
148 
149 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
150 struct MatMatStruct {
151   MatRowMapKokkosView   Cdstart; /* Used to split sequential matrix into petsc's A, B format */
152   PetscSF               sf; /* SF to send/recv matrix entries */
153   MatScalarKokkosView   abuf; /* buf of mat values in send/recv */
154   Mat                   C1,C2,B_local;
155   KokkosCsrMatrix       C1_global,C2_global,C_global;
156   KernelHandle          kh;
157   MatMatStruct() {
158     C1 = C2 = B_local = NULL;
159     sf = NULL;
160   }
161 
162   ~MatMatStruct() {
163     MatDestroy(&C1);
164     MatDestroy(&C2);
165     MatDestroy(&B_local);
166     PetscSFDestroy(&sf);
167     kh.destroy_spadd_handle();
168   }
169 };
170 
171 struct MatMatStruct_AB : public MatMatStruct {
172   MatColIdxKokkosView   rows;
173   MatRowMapKokkosView   rowoffset;
174   Mat                   B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
175 
176   MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
177   ~MatMatStruct_AB() {
178     MatDestroy(&B_other);
179     MatDestroy(&C_petsc);
180   }
181 };
182 
183 struct MatMatStruct_AtB : public MatMatStruct {
184   MatRowMapKokkosView   srcrowoffset,dstrowoffset;
185 };
186 
187 struct MatProductData_MPIAIJKokkos
188 {
189   MatMatStruct_AB   *mmAB;
190   MatMatStruct_AtB  *mmAtB;
191   PetscBool         reusesym;
192 
193   MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
194   ~MatProductData_MPIAIJKokkos() {
195     delete mmAB;
196     delete mmAtB;
197   }
198 };
199 
200 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
201 {
202   PetscFunctionBegin;
203   CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data));
204   PetscFunctionReturn(0);
205 }
206 
207 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
208 
209    Input Parameters:
210 +  A       - the MATSEQAIJKOKKOS matrix
211 .  N       - new column size for the returned Kokkos matrix
212 -  l2g     - a map that maps old col ids to new col ids
213 
214    Output Parameters:
215 .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
216  */
217 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
218 {
219   KokkosCsrMatrix&         orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
220   MatColIdxKokkosView      jg("jg",orig.nnz()); /* New j array for csrmat */
221 
222   PetscFunctionBegin;
223   CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));}));
224   CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg));
225   PetscFunctionReturn(0);
226 }
227 
228 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
229    It is similar to MatCreateMPIAIJWithSplitArrays.
230 
231   Input Parameters:
232 +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
233 .  A     - the diag matrix using local col ids
234 -  B     - the offdiag matrix using global col ids
235 
236   Output Parameters:
237 .  mat   - the updated MATMPIAIJKOKKOS matrix
238 */
239 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
240 {
241   PetscErrorCode      ierr;
242   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
243   PetscInt            m,n,M,N,Am,An,Bm,Bn;
244   Mat_SeqAIJKokkos    *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
245 
246   PetscFunctionBegin;
247   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
248   ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr);
249   ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr);
250   ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr);
251 
252   if (m != Am || m != Bm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match");
253   if (n != An) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match");
254   if (N != Bn) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match");
255   if (mpiaij->A || mpiaij->B) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty");
256   mpiaij->A = A;
257   mpiaij->B = B;
258 
259   mat->preallocated     = PETSC_TRUE;
260   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
261 
262   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
263   ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
264   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
265     also gets mpiaij->B compacted, with its col ids and size reduced
266   */
267   ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
268   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr);
269   ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
270 
271   /* Update bkok with new local col ids (stored on host) and size */
272   bkok->j_dual.modify_host();
273   bkok->j_dual.sync_device();
274   bkok->SetColSize(mpiaij->B->cmap->n);
275   PetscFunctionReturn(0);
276 }
277 
278 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C).
279 
280    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
281    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
282    Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k
283    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
284 
285    Collective on comm of ownerSF
286 
287    Input Parameters:
288 +   B       - the SEQAIJKOKKOS matrix, using local col ids
289 .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
290 .   N       - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
291 .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
292 .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
293 
294    Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
295 +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
296 .   abuf      - buffer for sending matrix values
297 .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
298                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
299 .   rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[]
300 -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
301 */
302 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
303                                            PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
304                                            MatRowMapKokkosView& rowoffset,Mat& C)
305 {
306   PetscErrorCode               ierr;
307   Mat_SeqAIJKokkos             *bkok,*ckok;
308 
309   PetscFunctionBegin;
310   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */
311   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
312 
313   if (reuse == MAT_REUSE_MATRIX) {
314     ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
315 
316     const auto& Ba = bkok->a_dual.view_device();
317     const auto& Bi = bkok->i_dual.view_device();
318     const auto& Ca = ckok->a_dual.view_device();
319 
320     /* Copy Ba to abuf */
321     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
322       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
323       PetscInt r    = rows(i);
324       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
325       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
326         abuf(base+k) = Ba(Bi(r)+k);
327       });
328     });
329 
330     /* Send abuf to Ca through bcastSF and then mark C is updated on device */
331     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */
332     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr);
333     ckok->a_dual.modify_device();
334   } else if (reuse == MAT_INITIAL_MATRIX) {
335     MPI_Comm       comm;
336     PetscMPIInt    tag;
337     PetscInt       k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;
338 
339     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr);
340     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
341     Cm   = nleaves; /* row size of C */
342     Cn   = N;  /* col size of C, which initially uses global ids, so we can safely set its col size as N */
343 
344     /* Get row lens (nz) of B's rows for later fast query */
345     PetscInt       *Browlens;
346     const PetscInt *tmp = bkok->i_host_data();
347     ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr);
348     for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];
349 
350     /* By ownerSF, each proc gets lens of rows of C */
351     MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
352     Ci_h    = Ci.view_host().data();
353     Ci_h[0] = 0;
354     ierr    = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
355     ierr    = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
356     for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
357     Cnnz    = Ci_h[Cm];
358     Ci.modify_host();
359     Ci.sync_device();
360 
361     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
362     MatColIdxKokkosDualView  Cj("j",Cnnz);
363     MatScalarKokkosDualView  Ca("a",Cnnz);
364 
365     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
366     const PetscMPIInt *iranks,*ranks;
367     const PetscInt    *ioffset,*irootloc,*roffset;
368     PetscInt          i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
369     MPI_Request       *reqs;
370 
371     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */
372     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */
373 
374     /* figure out offsets at the send buffer, to build the SF
375       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
376       rowptr[] - stores offsets for data of each row in abuf
377 
378       rdisp[]  - to receive sdisp[]
379     */
380     ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr);
381     MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
382     rowptr = rowptr_h.data();
383 
384     sdisp[0] = 0;
385     rowptr[0]  = 0;
386     for (i=0; i<niranks; i++) { /* for each receiver */
387       PetscInt len, nz = 0;
388       for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
389         len         = Browlens[irootloc[j]];
390         rowptr[j+1] = rowptr[j] + len;
391         nz         += len;
392       }
393       sdisp[i+1] = sdisp[i] + nz;
394     }
395     ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr);
396     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
397     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
398     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
399 
400     PetscInt    nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
401     PetscInt    nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
402     PetscSFNode *iremote;
403     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr);
404     for (i=0; i<nranks; i++) { /* for each sender */
405       k = 0;
406       for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
407         iremote[j].rank  = ranks[i];
408         iremote[j].index = rdisp[i] + k;
409         k++;
410       }
411     }
412     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
413     ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr);
414     ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
415 
416     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
417       from local to global. Then use bcastSF to fill Ca, Cj.
418     */
419     ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
420     MatColIdxKokkosView          rows("rows",ioffset[niranks]);
421     Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */
422 
423     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
424 
425     MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
426     abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */
427 
428     const auto& Ba = bkok->a_dual.view_device();
429     const auto& Bi = bkok->i_dual.view_device();
430     const auto& Bj = bkok->j_dual.view_device();
431 
432     /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
433     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
434       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
435       PetscInt r    = rows(i);
436       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
437       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
438         abuf(base+k) = Ba(Bi(r)+k);
439         jbuf(base+k) = l2g(Bj(Bi(r)+k));
440       });
441     });
442 
443     /* Send abuf & jbuf to fill Ca, Cj */
444     ierr = PetscSFBcastBegin(bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
445     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
446     ierr = PetscSFBcastEnd  (bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
447     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
448     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
449     Cj.sync_host();
450     Ca.modify_device();
451 
452     /* Construct C with Ca, Ci, Cj */
453     auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
454     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr);
455     ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr);
456     ierr = PetscFree(Browlens);CHKERRQ(ierr);
457   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
458   PetscFunctionReturn(0);
459 }
460 
461 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
462 
463   It is the reverse of MatSeqAIJKokkosBcast in some sense.
464 
465   Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves.
466   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
467   contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
468 
469   Input Parameters:
470 +  A        - the SEQAIJKOKKOS matrix to be reduced
471 .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
472 .  local    - true if A uses local col ids; false if A is already in global col ids.
473 .  N        - if local, N is A's global col size
474 .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
475 -  ownerSF  - the SF specifies ownership (root) of rows in A
476 
477   Output Parameters:
478 +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
479 .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
480 .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
481 .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to
482                   unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset.
483 -  C            - the matrix made up by rows sent to me from other ranks, using global col ids
484 
485    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
486  */
487 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
488                                             PetscSF& reduceSF,MatScalarKokkosView& abuf,
489                                             MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
490                                             KokkosCsrMatrix& C)
491 {
492   PetscErrorCode         ierr;
493   PetscInt               i,r,Am,An,Annz,Cnnz,nrows;
494   const PetscInt         *Ai;
495   Mat_SeqAIJKokkos       *akok;
496 
497   PetscFunctionBegin;
498   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */
499   ierr = MatGetSize(A,&Am,&An);
500   Ai   = static_cast<Mat_SeqAIJ*>(A->data)->i;
501   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
502   Annz = Ai[Am];
503 
504   if (reuse == MAT_REUSE_MATRIX) {
505     /* Send Aa to abuf */
506     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
507     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
508 
509     /* Copy abuf to Ca */
510     const MatScalarKokkosView& Ca = C.values;
511     nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
512     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
513       PetscInt i   = t.league_rank();
514       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
515       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
516       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
517     });
518   } else if (reuse == MAT_INITIAL_MATRIX) {
519     MPI_Comm               comm;
520     MPI_Request            *reqs;
521     PetscMPIInt            tag;
522     PetscInt               Cm;
523 
524     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr);
525     ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr);
526 
527     PetscInt niranks,nranks,nroots,nleaves;
528     const PetscMPIInt *iranks,*ranks;
529     const PetscInt *ioffset,*rows,*roffset;  /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
530     ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr);
531     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */
532     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */
533     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
534     if (nleaves != Am) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am);
535     Cm    = nroots;
536     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
537 
538     /* Tell owners how long each row I will send */
539     PetscInt                *srowlens; /* send buf of row lens */
540     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
541     PetscInt                *rrowlens = rrowlens_h.data();
542 
543     ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr);
544     for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
545     rrowlens[0] = 0;
546     rrowlens++; /* shift the pointer to make the following expression more readable */
547     for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
548     for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);}
549     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
550 
551     /* Owner builds Ci on host by histogramming rrowlens[] */
552     MatRowMapKokkosViewHost Ci_h("i",Cm+1);
553     Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
554     MatRowMapType *Ci_ptr = Ci_h.data();
555 
556     for (i=0; i<nrows; i++) {
557       r = rows[i]; /* local row id of i-th received row */
558      #if defined(PETSC_USE_DEBUG)
559       if (r<0 || r>=Cm) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm);
560      #endif
561       Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
562     }
563     for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
564     Cnnz = Ci_ptr[Cm];
565 
566     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
567     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
568     PetscInt                *dstrowoffset_hptr = dstrowoffset_h.data();
569     PetscInt                *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
570 
571     ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */
572     for (i=0; i<nrows; i++) { /* for each row I receive */
573       r                    = rows[i]; /* row id in C */
574       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
575       currowlens[r]       += rrowlens[i]; /* accumulate to length of row r in C */
576     }
577     ierr = PetscFree(currowlens);CHKERRQ(ierr);
578 
579     rrowlens--;
580     for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
581     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
582     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
583 
584     /* Build the reduceSF, which performs buffer to buffer send/recv */
585     PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
586     ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr);
587     for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
588     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
589     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
590     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
591 
592     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
593     PetscInt    nroots2 = Cnnz,nleaves2 = Annz;
594     PetscSFNode *iremote;
595     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */
596     for (i=0; i<nranks; i++) {
597       PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
598       PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
599       PetscInt nz       = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
600       for (PetscInt k=0; k<nz; k++) {
601         iremote[leafbase+k].rank  = ranks[i];
602         iremote[leafbase+k].index = rootbase + k;
603       }
604     }
605     ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr);
606     ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
607     ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr);
608 
609     /* Reduce Aa, Ajg to abuf and jbuf */
610 
611     /* If A uses local col ids, convert them to global ones before sending */
612     MatColIdxKokkosView Ajg;
613     if (local) {
614       Ajg = MatColIdxKokkosView("j",Annz);
615       const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
616       Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
617     } else {
618       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
619     }
620 
621     MatColIdxKokkosView   jbuf("jbuf",Cnnz);
622     abuf = MatScalarKokkosView("abuf",Cnnz);
623     ierr = PetscSFReduceBegin(reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
624     ierr = PetscSFReduceEnd  (reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
625     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
626     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
627 
628     /* Copy data from abuf, jbuf to Ca, Cj */
629     MatRowMapKokkosView    Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
630     MatColIdxKokkosView    Cj("j",Cnnz);
631     MatScalarKokkosView    Ca("a",Cnnz);
632 
633     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
634       PetscInt i   = t.league_rank();
635       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
636       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
637       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
638         Ca(dst+k) = abuf(src+k);
639         Cj(dst+k) = jbuf(src+k);
640       });
641     });
642 
643     /* Build C with Ca, Ci, Cj */
644     C    = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
645     ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr);
646   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
647   PetscFunctionReturn(0);
648 }
649 
650 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix
651 
652   Input Parameters:
653 +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
654 .  reuse    - indicate whether the matrix has called this function before
655 .  csrmat   - the KokkosCsrMatrix, of size m,N
656 -  Cdstart  - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first
657               entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
658               entry is 5, then Cdstart[i] = 3.
659 
660   Output Parameters:
661 +  C        - the updated MATMPIAIJKOKKOS matrix
662 -  Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
663 
664   Notes:
665    Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
666  */
667 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
668 {
669   PetscErrorCode                  ierr;
670   const MatScalarKokkosView&      Ca = csrmat.values;
671   const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
672   PetscInt                        m,n,N;
673 
674   PetscFunctionBegin;
675   ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr);
676   ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr);
677 
678   if (reuse == MAT_REUSE_MATRIX) {
679     Mat_MPIAIJ                  *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
680     Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
681     Mat_SeqAIJKokkos            *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
682     const MatScalarKokkosView&  Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
683     const MatRowMapKokkosView&  Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();
684 
685     /* Fill 'a' of Cd and Co on device */
686     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
687       PetscInt i       = t.league_rank(); /* row i */
688       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
689       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
690       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
691       PetscInt cdend   = cdstart + cdlen;
692       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
693       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
694         if (k < cdstart) {  /* k in [0, cdstart) */
695           Coa(Coi(i)+k) = Ca(Ci(i)+k);
696         } else if (k < cdend) { /* k in [cdstart, cdend) */
697           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
698         } else { /* k in [cdend, clen) */
699           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
700         }
701       });
702     });
703 
704     akok->a_dual.modify_device();
705     bkok->a_dual.modify_device();
706   } else if (reuse == MAT_INITIAL_MATRIX) {
707     Mat                         Cd,Co;
708     const MatColIdxKokkosView&  Cj = csrmat.graph.entries;
709     MatRowMapKokkosDualView     Cdi_dual("i",m+1),Coi_dual("i",m+1);
710     MatRowMapKokkosView         Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
711     PetscInt                    cstart,cend;
712 
713     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
714        left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend).
715        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
716        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly
717        stores values of cdstart and cdend-cstart (aka Cdi[]) instead.
718      */
719     Cdstart = MatRowMapKokkosView("Cdstart",m);
720     ierr    = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
721 
722     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
723       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
724      */
725     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
726       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
727         PetscInt i = t.league_rank(); /* row i */
728         PetscInt j,first,count,step;
729 
730         if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
731           Cdi(0) = 0;
732           Coi(0) = 0;
733         }
734 
735         /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
736           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
737         */
738         count = Ci(i+1)-Ci(i);
739         first = Ci(i);
740         while (count > 0) {
741           j    = first;
742           step = count / 2;
743           j   += step;
744           if (Cj(j) < cstart) {
745             first  = ++j;
746             count -= step + 1;
747           } else count = step;
748         }
749         Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
750 
751         /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
752         count = Ci(i+1) - first;
753         while (count > 0) {
754           j    = first;
755           step = count / 2;
756           j   += step;
757           if (Cj(j) < cend) {
758             first  = ++j;
759             count -= step + 1;
760           } else count = step;
761         }
762         Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
763         Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
764       });
765     });
766 
767     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
768     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
769       update += Cdi(i);
770       if (final) Cdi(i) = update;
771     });
772     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
773       update += Coi(i);
774       if (final) Coi(i) = update;
775     });
776 
777     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
778        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
779     */
780     Cdi_dual.modify_device();
781     Coi_dual.modify_device();
782     Cdi_dual.sync_host();
783     Coi_dual.sync_host();
784     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
785     PetscInt Co_nnz = Coi_dual.view_host().data()[m];
786 
787     /* With nnz, allocate a, j for Cd and Co */
788     MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
789     MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);
790 
791     /* Fill a, j of Cd and Co on device */
792     MatColIdxKokkosView     Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
793     MatScalarKokkosView     Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();
794 
795     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
796       PetscInt i       = t.league_rank(); /* row i */
797       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
798       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
799       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
800       PetscInt cdend   = cdstart + cdlen;
801       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
802       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
803         if (k < cdstart) { /* k in [0, cdstart) */
804           Coa(Coi(i)+k) = Ca(Ci(i)+k);
805           Coj(Coi(i)+k) = Cj(Ci(i)+k);
806         } else if (k < cdend) { /* k in [cdstart, cdend) */
807           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
808           Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
809         } else { /* k in [cdend, clen) */
810           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
811           Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
812         }
813       });
814     });
815 
816     Cdj_dual.modify_device();
817     Cda_dual.modify_device();
818     Coj_dual.modify_device();
819     Coa_dual.modify_device();
820     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
821     auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
822     auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
823     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr);
824     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr);
825     ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */
826   }
827   PetscFunctionReturn(0);
828 }
829 
830 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids.
831 
832   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
833 
834   Input Parameters:
835 +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
836 .  reuse    - indicate whether the matrix has called this function before
837 .  csrmat   - the KokkosCsrMatrix, of size m,N
838 -  Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first
839               entry of the diag block of C in csrmat's j array.
840 
841   Output Parameters:
842 +  C        - the updated MATMPIAIJKOKKOS matrix
843 -  Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
844 
845   Notes: the input matrix's col ids and col size will be changed.
846 */
847 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
848 {
849   PetscErrorCode         ierr;
850   Mat_SeqAIJKokkos       *ckok;
851   ISLocalToGlobalMapping l2gmap;
852   const PetscInt         *garray;
853   PetscInt               sz;
854 
855   PetscFunctionBegin;
856   /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */
857   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr);
858   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
859   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
860   ckok->j_dual.sync_device();
861   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
862 
863   /* Build l2g -- the local to global mapping of C's cols */
864   ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr);
865   ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr);
866   if (C->cmap->n != sz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz);
867 
868   ConstMatColIdxKokkosViewHost tmp(garray,sz);
869   l2g = MatColIdxKokkosView("l2g",sz);
870   Kokkos::deep_copy(l2g,tmp);
871 
872   ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr);
873   ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr);
874   PetscFunctionReturn(0);
875 }
876 
877 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
878 
879   Input Parameters:
880 +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
881 .  A        - an MPIAIJKOKKOS matrix
882 .  B        - an MPIAIJKOKKOS matrix
883 -  mm       - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations.
884 
885   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
886 */
887 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
888 {
889   PetscErrorCode              ierr;
890   Mat_MPIAIJ                  *a = static_cast<Mat_MPIAIJ*>(A->data);
891   Mat                         Ad = a->A,Ao = a->B; /* diag and offdiag of A */
892   IS                          glob = NULL;
893   const PetscInt              *garray;
894   PetscInt                    N = B->cmap->N,sz;
895   ConstMatColIdxKokkosView    l2g1; /* two temp maps mapping local col ids to global ones */
896   MatColIdxKokkosView         l2g2;
897   Mat                         C1,C2; /* intermediate matrices */
898 
899   PetscFunctionBegin;
900   /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
901   ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr);
902   ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr);
903   ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr);
904   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
905   C1->product->api_user = product->api_user;
906   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
907   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
908   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
909 
910   ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
911   ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
912   const auto& tmp  = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
913   l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
914   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global);
915 
916   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
917   ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,
918                               mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr);
919 
920   /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */
921   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr);
922   ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr);
923   ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr);
924   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
925   C2->product->api_user = product->api_user;
926   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
927   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
928   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
929   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global);
930 
931   /* C = C1 + C2.  We actually use their global col ids versions in adding */
932   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
933   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
934   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
935   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
936 
937   mm->C1 = C1;
938   mm->C2 = C2;
939   ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
940   ierr = ISDestroy(&glob);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
945 
946   Input Parameters:
947 +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
948 .  A        - an MPIAIJKOKKOS matrix
949 .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
950 .  localB   - Does B use local col ids? If false, then B is already in global col ids.
951 .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
952 .  l2g      - If localB, then l2g maps B's local col ids to global ones.
953 -  mm       - a struct used to stash intermediate data in AtB
954 
955   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
956 */
957 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
958 {
959   PetscErrorCode         ierr;
960   Mat_MPIAIJ             *a = static_cast<Mat_MPIAIJ*>(A->data);
961   Mat                    Ad = a->A,Ao = a->B; /* diag and offdiag of A */
962   Mat                    C1,C2; /* intermediate matrices */
963 
964   PetscFunctionBegin;
965   /* C1 = Ad^t * B */
966   ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr);
967   ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr);
968   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
969   C1->product->api_user = product->api_user;
970   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
971   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
972   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
973 
974   if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);}
975   else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
976 
977   /* C2 = Ao^t * B */
978   ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr);
979   ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr);
980   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
981   C2->product->api_user = product->api_user;
982   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
983   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
984   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
985 
986   ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,
987                                mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr);
988 
989   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
990   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
991   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
992   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
993   mm->C1 = C1;
994   mm->C2 = C2;
995   PetscFunctionReturn(0);
996 }
997 
998 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
999 {
1000   PetscErrorCode                ierr;
1001   Mat_Product                   *product = C->product;
1002   MatProductType                ptype;
1003   MatProductData_MPIAIJKokkos   *mmdata;
1004   MatMatStruct                  *mm = NULL;
1005   MatMatStruct_AB               *ab;
1006   MatMatStruct_AtB              *atb;
1007   Mat                           A,B,Ad,Ao,Bd,Bo;
1008   const MatScalarType           one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */
1009 
1010   PetscFunctionBegin;
1011   MatCheckProduct(C,1);
1012   mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
1013   ptype  = product->type;
1014   A      = product->A;
1015   B      = product->B;
1016   Ad     = static_cast<Mat_MPIAIJ*>(A->data)->A;
1017   Ao     = static_cast<Mat_MPIAIJ*>(A->data)->B;
1018   Bd     = static_cast<Mat_MPIAIJ*>(B->data)->A;
1019   Bo     = static_cast<Mat_MPIAIJ*>(B->data)->B;
1020 
1021   if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1022     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
1023     ab  = mmdata->mmAB;
1024     atb = mmdata->mmAtB;
1025     if (ab) {
1026       static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1027       static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1028     }
1029     if (atb) {
1030       static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1031       static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1032     }
1033     PetscFunctionReturn(0);
1034   }
1035 
1036   if (ptype == MATPRODUCT_AB) {
1037     ab   = mmdata->mmAB;
1038     /* C1 = Ad * B_local */
1039     if (!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB");
1040     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1041     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed");
1042     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1043     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1044     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1045                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1046     /* C2 = Ao * B_other */
1047     if (ab->C2->product->B != ab->B_other) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed");
1048     if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1049     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr);
1050     /* C = C1_global + C2_global */
1051     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1052     mm = static_cast<MatMatStruct*>(ab);
1053   } else if (ptype == MATPRODUCT_AtB) {
1054     atb  = mmdata->mmAtB;
1055     if (!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB");
1056     /* C1 = Ad^t * B_local */
1057     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr);
1058     if (atb->C1->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed");
1059     if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1060     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1061 
1062     /* C2 = Ao^t * B_local */
1063     if (atb->C2->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed");
1064     if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1065     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1066     /* Form C2_global */
1067     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1068                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1069     /* C = C1_global + C2_global */
1070     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1071     mm = static_cast<MatMatStruct*>(atb);
1072   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1073     ab   = mmdata->mmAB;
1074     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1075 
1076     /* ab->C1 = Ad * B_local */
1077     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed");
1078     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1079     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1080     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1081                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1082     /* ab->C2 = Ao * B_other */
1083     if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1084     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */
1085     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1086 
1087     /* atb->C1 = Bd^t * ab->C_petsc */
1088     atb  = mmdata->mmAtB;
1089     if (atb->C1->product->B != ab->C_petsc) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed");
1090     if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1091     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1092     /* atb->C2 = Bo^t * ab->C_petsc */
1093     if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1094     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1095     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1096                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1097     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1098     mm = static_cast<MatMatStruct*>(atb);
1099   }
1100   /* Split C_global to form C */
1101   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1106 {
1107   PetscErrorCode              ierr;
1108   Mat                         A,B;
1109   Mat_Product                 *product = C->product;
1110   MatProductType              ptype;
1111   MatProductData_MPIAIJKokkos *mmdata;
1112   MatMatStruct                *mm = NULL;
1113   IS                          glob = NULL;
1114   const PetscInt              *garray;
1115   PetscInt                    m,n,M,N,sz;
1116   ConstMatColIdxKokkosView    l2g; /* map local col ids to global ones */
1117 
1118   PetscFunctionBegin;
1119   MatCheckProduct(C,1);
1120   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1121   ptype = product->type;
1122   A     = product->A;
1123   B     = product->B;
1124 
1125   switch (ptype) {
1126     case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1127     case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1128     case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
1129     default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1130   }
1131 
1132   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
1133   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
1134   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
1135   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1136 
1137   mmdata           = new MatProductData_MPIAIJKokkos();
1138   mmdata->reusesym = product->api_user;
1139 
1140   if (ptype == MATPRODUCT_AB) {
1141     mmdata->mmAB = new MatMatStruct_AB();
1142     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr);
1143     mm   = static_cast<MatMatStruct*>(mmdata->mmAB);
1144   } else if (ptype == MATPRODUCT_AtB) {
1145     mmdata->mmAtB = new MatMatStruct_AtB();
1146     auto atb      = mmdata->mmAtB;
1147     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr);
1148     ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
1149     ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
1150     l2g  = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
1151     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr);
1152     ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
1153     ierr = ISDestroy(&glob);CHKERRQ(ierr);
1154     mm   = static_cast<MatMatStruct*>(atb);
1155   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1156     mmdata->mmAB  = new MatMatStruct_AB(); /* tmp=A*B */
1157     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1158     auto ab       = mmdata->mmAB;
1159     auto atb      = mmdata->mmAtB;
1160     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr);
1161     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1162     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr);
1163     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr);
1164     mm   = static_cast<MatMatStruct*>(atb);
1165   }
1166   /* Split the C_global into petsc A, B format */
1167   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1168   C->product->data        = mmdata;
1169   C->product->destroy     = MatProductDataDestroy_MPIAIJKokkos;
1170   C->ops->productnumeric  = MatProductNumeric_MPIAIJKokkos;
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1175 {
1176   PetscErrorCode ierr;
1177   Mat_Product    *product = mat->product;
1178   PetscBool      match = PETSC_FALSE;
1179   PetscBool      usecpu = PETSC_FALSE;
1180 
1181   PetscFunctionBegin;
1182   MatCheckProduct(mat,1);
1183   if (!product->A->boundtocpu && !product->B->boundtocpu) {
1184     ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr);
1185   }
1186   if (match) { /* we can always fallback to the CPU if requested */
1187     switch (product->type) {
1188     case MATPRODUCT_AB:
1189       if (product->api_user) {
1190         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1191         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1192         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1193       } else {
1194         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1195         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1196         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1197       }
1198       break;
1199     case MATPRODUCT_AtB:
1200       if (product->api_user) {
1201         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1202         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1203         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1204       } else {
1205         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1206         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1207         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1208       }
1209       break;
1210     case MATPRODUCT_PtAP:
1211       if (product->api_user) {
1212         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1213         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1214         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1215       } else {
1216         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
1217         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1218         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1219       }
1220       break;
1221     default:
1222       break;
1223     }
1224     match = (PetscBool)!usecpu;
1225   }
1226   if (match) {
1227     switch (product->type) {
1228     case MATPRODUCT_AB:
1229     case MATPRODUCT_AtB:
1230     case MATPRODUCT_PtAP:
1231       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1232       break;
1233     default:
1234       break;
1235     }
1236   }
1237   /* fallback to MPIAIJ ops */
1238   if (!mat->ops->productsymbolic) {
1239     ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
1240   }
1241   PetscFunctionReturn(0);
1242 }
1243 
1244 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1245 {
1246   PetscErrorCode     ierr;
1247 
1248   PetscFunctionBegin;
1249   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
1250   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
1251   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1256 {
1257   PetscErrorCode     ierr;
1258   Mat                B;
1259   Mat_MPIAIJ         *a;
1260 
1261   PetscFunctionBegin;
1262   if (reuse == MAT_INITIAL_MATRIX) {
1263     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
1264   } else if (reuse == MAT_REUSE_MATRIX) {
1265     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
1266   }
1267   B = *newmat;
1268 
1269   B->boundtocpu = PETSC_FALSE;
1270   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1271   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
1272   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1273 
1274   a = static_cast<Mat_MPIAIJ*>(A->data);
1275   if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1276   if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1277   if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);}
1278 
1279   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1280   B->ops->mult                  = MatMult_MPIAIJKokkos;
1281   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1282   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1283   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1284   B->ops->destroy               = MatDestroy_MPIAIJKokkos;
1285 
1286   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1287   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr);
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1292 {
1293   PetscErrorCode ierr;
1294 
1295   PetscFunctionBegin;
1296   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1297   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
1298   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1299   PetscFunctionReturn(0);
1300 }
1301 
1302 /*@C
1303    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1304    (the default parallel PETSc format).  This matrix will ultimately pushed down
1305    to Kokkos for calculations. For good matrix
1306    assembly performance the user should preallocate the matrix storage by setting
1307    the parameter nz (or the array nnz).  By setting these parameters accurately,
1308    performance during matrix assembly can be increased by more than a factor of 50.
1309 
1310    Collective
1311 
1312    Input Parameters:
1313 +  comm - MPI communicator, set to PETSC_COMM_SELF
1314 .  m - number of rows
1315 .  n - number of columns
1316 .  nz - number of nonzeros per row (same for all rows)
1317 -  nnz - array containing the number of nonzeros in the various rows
1318          (possibly different for each row) or NULL
1319 
1320    Output Parameter:
1321 .  A - the matrix
1322 
1323    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1324    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1325    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1326 
1327    Notes:
1328    If nnz is given then nz is ignored
1329 
1330    The AIJ format (also called the Yale sparse matrix format or
1331    compressed row storage), is fully compatible with standard Fortran 77
1332    storage.  That is, the stored row and column indices can begin at
1333    either one (as in Fortran) or zero.  See the users' manual for details.
1334 
1335    Specify the preallocated storage with either nz or nnz (not both).
1336    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1337    allocation.  For large problems you MUST preallocate memory or you
1338    will get TERRIBLE performance, see the users' manual chapter on matrices.
1339 
1340    By default, this format uses inodes (identical nodes) when possible, to
1341    improve numerical efficiency of matrix-vector products and solves. We
1342    search for consecutive rows with the same nonzero structure, thereby
1343    reusing matrix information to achieve increased efficiency.
1344 
1345    Level: intermediate
1346 
1347 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
1348 @*/
1349 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
1350 {
1351   PetscErrorCode ierr;
1352   PetscMPIInt    size;
1353 
1354   PetscFunctionBegin;
1355   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1356   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
1357   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1358   if (size > 1) {
1359     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1360     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1361   } else {
1362     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1363     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
1364   }
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
1369 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1370 {
1371   PetscMPIInt                size,rank;
1372   MPI_Comm                   comm;
1373   PetscErrorCode             ierr;
1374   PetscSplitCSRDataStructure d_mat=NULL;
1375 
1376   PetscFunctionBegin;
1377   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1378   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1379   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
1380   if (size == 1) {
1381     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
1382     ierr   = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */
1383   } else {
1384     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
1385     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
1386     ierr   = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr);
1387     ierr   = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr);
1388   }
1389   // act like MatSetValues because not called on host
1390   if (A->assembled) {
1391     if (A->was_assembled) {
1392       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
1393     }
1394     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
1395   } else {
1396     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr);
1397     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
1398   }
1399   if (!d_mat) {
1400     struct _n_SplitCSRMat h_mat; /* host container */
1401     Mat_SeqAIJKokkos      *aijkokA;
1402     Mat_SeqAIJ            *jaca;
1403     PetscInt              n = A->rmap->n, nnz;
1404     Mat                   Amat;
1405     PetscInt              *colmap;
1406 
1407     /* create and copy h_mat */
1408     h_mat.M = A->cmap->N; // use for debug build
1409     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
1410     if (size == 1) {
1411       Amat = A;
1412       jaca = (Mat_SeqAIJ*)A->data;
1413       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1414       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1415       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1416       h_mat.offdiag.a = NULL;
1417       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1418       aijkokA->i_uncompressed_d = NULL;
1419       aijkokA->colmap_d = NULL;
1420     } else {
1421       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
1422       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
1423       PetscInt         ii;
1424       Mat_SeqAIJKokkos *aijkokB;
1425 
1426       Amat = aij->A;
1427       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1428       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1429       aijkokA->i_uncompressed_d = NULL;
1430       aijkokA->colmap_d = NULL;
1431       jaca = (Mat_SeqAIJ*)aij->A->data;
1432       if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
1433       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
1434       aij->donotstash = PETSC_TRUE;
1435       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1436       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1437       ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr);
1438       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
1439       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1440       // allocate B copy data
1441       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1442       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1443       nnz = jacb->i[n];
1444       if (jacb->compressedrow.use) {
1445         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
1446         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1447         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
1448         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
1449       } else {
1450          h_mat.offdiag.i = aijkokB->i_device_data();
1451       }
1452       h_mat.offdiag.j = aijkokB->j_device_data();
1453       h_mat.offdiag.a = aijkokB->a_device_data();
1454       {
1455         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
1456         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1457         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
1458         h_mat.colmap = aijkokB->colmap_d->data();
1459         ierr = PetscFree(colmap);CHKERRQ(ierr);
1460       }
1461       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1462       h_mat.offdiag.n = n;
1463     }
1464     // allocate A copy data
1465     nnz = jaca->i[n];
1466     h_mat.diag.n = n;
1467     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1468     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
1469     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
1470     else {
1471       h_mat.diag.i = aijkokA->i_device_data();
1472     }
1473     h_mat.diag.j = aijkokA->j_device_data();
1474     h_mat.diag.a = aijkokA->a_device_data();
1475     // copy pointers and metdata to device
1476     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
1477     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
1478     ierr = PetscInfo2(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
1479   }
1480   *B = d_mat; // return it, set it in Mat, and set it up
1481   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
1482   PetscFunctionReturn(0);
1483 }
1484 
1485 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1486 {
1487   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1488 
1489   PetscFunctionBegin;
1490   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1491   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1492   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1493   else *mask = "PETSC_OFFLOAD_BOTH";
1494   PetscFunctionReturn(0);
1495 }
1496 
1497 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1498 {
1499   PetscErrorCode    ierr;
1500   PetscMPIInt       size;
1501   Mat               Ad,Ao;
1502   const char        *amask,*bmask;
1503 
1504   PetscFunctionBegin;
1505   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
1506 
1507   if (size == 1) {
1508     ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr);
1509     ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr);
1510   } else {
1511     Ad  = ((Mat_MPIAIJ*)A->data)->A;
1512     Ao  = ((Mat_MPIAIJ*)A->data)->B;
1513     ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr);
1514     ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr);
1515     ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr);
1516   }
1517   PetscFunctionReturn(0);
1518 }
1519