xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision de40df40cf67d557bacf774f8a40b904b72e5e2e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscsf.h>
3 #include <petsc/private/sfimpl.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h>
5 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
6 #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
7 #include <KokkosSparse_spadd.hpp>
8 
9 PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
10 {
11   PetscErrorCode   ierr;
12   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
13   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
14 
15   PetscFunctionBegin;
16   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
17   if (aijkok && aijkok->device_mat_d.data()) {
18     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19   }
20 
21   PetscFunctionReturn(0);
22 }
23 
24 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
25 {
26   PetscErrorCode ierr;
27   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
28 
29   PetscFunctionBegin;
30   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
31   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
32 #if defined(PETSC_USE_DEBUG)
33   if (d_nnz) {
34     PetscInt i;
35     for (i=0; i<mat->rmap->n; i++) {
36       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
37     }
38   }
39   if (o_nnz) {
40     PetscInt i;
41     for (i=0; i<mat->rmap->n; i++) {
42       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
43     }
44   }
45 #endif
46 #if defined(PETSC_USE_CTABLE)
47   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
48 #else
49   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
50 #endif
51   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
52   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
53   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
54   /* Because the B will have been resized we simply destroy it and create a new one each time */
55   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
56 
57   if (!mpiaij->A) {
58     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
59     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
60     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
61   }
62   if (!mpiaij->B) {
63     PetscMPIInt size;
64     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
65     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
66     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
67     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
68   }
69   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
70   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
71   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
72   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
73   mat->preallocated = PETSC_TRUE;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
78 {
79   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
80   PetscErrorCode ierr;
81   PetscInt       nt;
82 
83   PetscFunctionBegin;
84   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
85   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
86   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
87   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
88   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
89   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
94 {
95   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
96   PetscErrorCode ierr;
97   PetscInt       nt;
98 
99   PetscFunctionBegin;
100   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
101   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt);
102   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
103   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
104   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
105   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
106   PetscFunctionReturn(0);
107 }
108 
109 PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
110 {
111   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
112   PetscErrorCode ierr;
113   PetscInt       nt;
114 
115   PetscFunctionBegin;
116   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
117   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt);
118   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
119   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
120   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
121   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
122   PetscFunctionReturn(0);
123 }
124 
125 /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
126    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
127    C still uses local column ids. Their corresponding global column ids are returned in glob.
128 */
129 PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C)
130 {
131   Mat            Ad,Ao;
132   const PetscInt *cmap;
133   PetscErrorCode ierr;
134 
135   PetscFunctionBegin;
136   ierr = MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap);CHKERRQ(ierr);
137   ierr = MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C);CHKERRQ(ierr);
138   if (glob) {
139     PetscInt cst, i, dn, on, *gidx;
140     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
141     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
142     ierr = MatGetOwnershipRangeColumn(mat,&cst,NULL);CHKERRQ(ierr);
143     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
144     for (i=0; i<dn; i++) gidx[i]    = cst + i;
145     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
146     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
147   }
148   PetscFunctionReturn(0);
149 }
150 
151 /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
152 struct MatMatStruct {
153   MatRowMapKokkosView   Cdstart; /* Used to split sequential matrix into petsc's A, B format */
154   PetscSF               sf; /* SF to send/recv matrix entries */
155   MatScalarKokkosView   abuf; /* buf of mat values in send/recv */
156   Mat                   C1,C2,B_local;
157   KokkosCsrMatrix       C1_global,C2_global,C_global;
158   KernelHandle          kh;
159   MatMatStruct() {
160     C1 = C2 = B_local = NULL;
161     sf = NULL;
162   }
163 
164   ~MatMatStruct() {
165     MatDestroy(&C1);
166     MatDestroy(&C2);
167     MatDestroy(&B_local);
168     PetscSFDestroy(&sf);
169     kh.destroy_spadd_handle();
170   }
171 };
172 
173 struct MatMatStruct_AB : public MatMatStruct {
174   MatColIdxKokkosView   rows;
175   MatRowMapKokkosView   rowoffset;
176   Mat                   B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
177 
178   MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){}
179   ~MatMatStruct_AB() {
180     MatDestroy(&B_other);
181     MatDestroy(&C_petsc);
182   }
183 };
184 
185 struct MatMatStruct_AtB : public MatMatStruct {
186   MatRowMapKokkosView   srcrowoffset,dstrowoffset;
187 };
188 
189 struct MatProductData_MPIAIJKokkos
190 {
191   MatMatStruct_AB   *mmAB;
192   MatMatStruct_AtB  *mmAtB;
193   PetscBool         reusesym;
194 
195   MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){}
196   ~MatProductData_MPIAIJKokkos() {
197     delete mmAB;
198     delete mmAtB;
199   }
200 };
201 
202 static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
203 {
204   PetscFunctionBegin;
205   CHKERRCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data));
206   PetscFunctionReturn(0);
207 }
208 
209 /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
210 
211    Input Parameters:
212 +  A       - the MATSEQAIJKOKKOS matrix
213 .  N       - new column size for the returned Kokkos matrix
214 -  l2g     - a map that maps old col ids to new col ids
215 
216    Output Parameters:
217 .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
218  */
219 static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat)
220 {
221   KokkosCsrMatrix&         orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat;
222   MatColIdxKokkosView      jg("jg",orig.nnz()); /* New j array for csrmat */
223 
224   PetscFunctionBegin;
225   CHKERRCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));}));
226   CHKERRCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg));
227   PetscFunctionReturn(0);
228 }
229 
230 /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
231    It is similar to MatCreateMPIAIJWithSplitArrays.
232 
233   Input Parameters:
234 +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
235 .  A     - the diag matrix using local col ids
236 -  B     - the offdiag matrix using global col ids
237 
238   Output Parameters:
239 .  mat   - the updated MATMPIAIJKOKKOS matrix
240 */
241 static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B)
242 {
243   PetscErrorCode      ierr;
244   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
245   PetscInt            m,n,M,N,Am,An,Bm,Bn;
246   Mat_SeqAIJKokkos    *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
247 
248   PetscFunctionBegin;
249   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
250   ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr);
251   ierr = MatGetLocalSize(A,&Am,&An);CHKERRQ(ierr);
252   ierr = MatGetLocalSize(B,&Bm,&Bn);CHKERRQ(ierr);
253 
254   if (m != Am || m != Bm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match");
255   if (n != An) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match");
256   if (N != Bn) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match");
257   if (mpiaij->A || mpiaij->B) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty");
258   mpiaij->A = A;
259   mpiaij->B = B;
260 
261   mat->preallocated     = PETSC_TRUE;
262   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
263 
264   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
265   ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
266   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
267     also gets mpiaij->B compacted, with its col ids and size reduced
268   */
269   ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
270   ierr = MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE);CHKERRQ(ierr);
271   ierr = MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
272 
273   /* Update bkok with new local col ids (stored on host) and size */
274   bkok->j_dual.modify_host();
275   bkok->j_dual.sync_device();
276   bkok->SetColSize(mpiaij->B->cmap->n);
277   PetscFunctionReturn(0);
278 }
279 
280 /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C).
281 
282    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
283    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
284    Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k
285    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
286 
287    Collective on comm of ownerSF
288 
289    Input Parameters:
290 +   B       - the SEQAIJKOKKOS matrix, using local col ids
291 .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
292 .   N       - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
293 .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
294 .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
295 
296    Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
297 +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
298 .   abuf      - buffer for sending matrix values
299 .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
300                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
301 .   rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[]
302 -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
303 */
304 static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
305                                            PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows,
306                                            MatRowMapKokkosView& rowoffset,Mat& C)
307 {
308   PetscErrorCode               ierr;
309   Mat_SeqAIJKokkos             *bkok,*ckok;
310 
311   PetscFunctionBegin;
312   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); /* Make sure B->spptr is accessible */
313   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
314 
315   if (reuse == MAT_REUSE_MATRIX) {
316     ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
317 
318     const auto& Ba = bkok->a_dual.view_device();
319     const auto& Bi = bkok->i_dual.view_device();
320     const auto& Ca = ckok->a_dual.view_device();
321 
322     /* Copy Ba to abuf */
323     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
324       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
325       PetscInt r    = rows(i);
326       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
327       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
328         abuf(base+k) = Ba(Bi(r)+k);
329       });
330     });
331 
332     /* Send abuf to Ca through bcastSF and then mark C is updated on device */
333     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr); /* TODO: get memtype for abuf */
334     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE);CHKERRQ(ierr);
335     ckok->a_dual.modify_device();
336   } else if (reuse == MAT_INITIAL_MATRIX) {
337     MPI_Comm       comm;
338     PetscMPIInt    tag;
339     PetscInt       k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves;
340 
341     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRMPI(ierr);
342     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
343     Cm   = nleaves; /* row size of C */
344     Cn   = N;  /* col size of C, which initially uses global ids, so we can safely set its col size as N */
345 
346     /* Get row lens (nz) of B's rows for later fast query */
347     PetscInt       *Browlens;
348     const PetscInt *tmp = bkok->i_host_data();
349     ierr = PetscMalloc1(nroots,&Browlens);CHKERRQ(ierr);
350     for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k];
351 
352     /* By ownerSF, each proc gets lens of rows of C */
353     MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */
354     Ci_h    = Ci.view_host().data();
355     Ci_h[0] = 0;
356     ierr    = PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
357     ierr    = PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE);CHKERRQ(ierr);
358     for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */
359     Cnnz    = Ci_h[Cm];
360     Ci.modify_host();
361     Ci.sync_device();
362 
363     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
364     MatColIdxKokkosDualView  Cj("j",Cnnz);
365     MatScalarKokkosDualView  Ca("a",Cnnz);
366 
367     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
368     const PetscMPIInt *iranks,*ranks;
369     const PetscInt    *ioffset,*irootloc,*roffset;
370     PetscInt          i,j,niranks,nranks,*sdisp,*rdisp,*rowptr;
371     MPI_Request       *reqs;
372 
373     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* irootloc[] contains indices of rows I need to send to each receiver */
374     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* recv info */
375 
376     /* figure out offsets at the send buffer, to build the SF
377       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
378       rowptr[] - stores offsets for data of each row in abuf
379 
380       rdisp[]  - to receive sdisp[]
381     */
382     ierr = PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs);CHKERRQ(ierr);
383     MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
384     rowptr = rowptr_h.data();
385 
386     sdisp[0] = 0;
387     rowptr[0]  = 0;
388     for (i=0; i<niranks; i++) { /* for each receiver */
389       PetscInt len, nz = 0;
390       for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */
391         len         = Browlens[irootloc[j]];
392         rowptr[j+1] = rowptr[j] + len;
393         nz         += len;
394       }
395       sdisp[i+1] = sdisp[i] + nz;
396     }
397     ierr = PetscCommGetNewTag(comm,&tag);CHKERRMPI(ierr);
398     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
399     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
400     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
401 
402     PetscInt    nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
403     PetscInt    nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
404     PetscSFNode *iremote;
405     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr);
406     for (i=0; i<nranks; i++) { /* for each sender */
407       k = 0;
408       for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) {
409         iremote[j].rank  = ranks[i];
410         iremote[j].index = rdisp[i] + k;
411         k++;
412       }
413     }
414     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
415     ierr = PetscSFCreate(comm,&bcastSF);CHKERRQ(ierr);
416     ierr = PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
417 
418     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
419       from local to global. Then use bcastSF to fill Ca, Cj.
420     */
421     ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
422     MatColIdxKokkosView          rows("rows",ioffset[niranks]);
423     Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */
424 
425     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
426 
427     MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */
428     abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */
429 
430     const auto& Ba = bkok->a_dual.view_device();
431     const auto& Bi = bkok->i_dual.view_device();
432     const auto& Bj = bkok->j_dual.view_device();
433 
434     /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
435     Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
436       PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
437       PetscInt r    = rows(i);
438       PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
439       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) {
440         abuf(base+k) = Ba(Bi(r)+k);
441         jbuf(base+k) = l2g(Bj(Bi(r)+k));
442       });
443     });
444 
445     /* Send abuf & jbuf to fill Ca, Cj */
446     ierr = PetscSFBcastBegin(bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
447     ierr = PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
448     ierr = PetscSFBcastEnd  (bcastSF,MPIU_INT,   jbuf.data(),Cj.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
449     ierr = PetscSFBcastEnd  (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE);CHKERRQ(ierr);
450     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
451     Cj.sync_host();
452     Ca.modify_device();
453 
454     /* Construct C with Ca, Ci, Cj */
455     auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca);
456     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C);CHKERRQ(ierr);
457     ierr = PetscFree3(sdisp,rdisp,reqs);CHKERRQ(ierr);
458     ierr = PetscFree(Browlens);CHKERRQ(ierr);
459   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
460   PetscFunctionReturn(0);
461 }
462 
463 /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
464 
465   It is the reverse of MatSeqAIJKokkosBcast in some sense.
466 
467   Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves.
468   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
469   contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.
470 
471   Input Parameters:
472 +  A        - the SEQAIJKOKKOS matrix to be reduced
473 .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
474 .  local    - true if A uses local col ids; false if A is already in global col ids.
475 .  N        - if local, N is A's global col size
476 .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
477 -  ownerSF  - the SF specifies ownership (root) of rows in A
478 
479   Output Parameters:
480 +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
481 .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
482 .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
483 .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to
484                   unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset.
485 -  C            - the matrix made up by rows sent to me from other ranks, using global col ids
486 
487    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
488  */
489 static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF,
490                                             PetscSF& reduceSF,MatScalarKokkosView& abuf,
491                                             MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset,
492                                             KokkosCsrMatrix& C)
493 {
494   PetscErrorCode         ierr;
495   PetscInt               i,r,Am,An,Annz,Cnnz,nrows;
496   const PetscInt         *Ai;
497   Mat_SeqAIJKokkos       *akok;
498 
499   PetscFunctionBegin;
500   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); /* So that A's latest data is on device */
501   ierr = MatGetSize(A,&Am,&An);
502   Ai   = static_cast<Mat_SeqAIJ*>(A->data)->i;
503   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
504   Annz = Ai[Am];
505 
506   if (reuse == MAT_REUSE_MATRIX) {
507     /* Send Aa to abuf */
508     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
509     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
510 
511     /* Copy abuf to Ca */
512     const MatScalarKokkosView& Ca = C.values;
513     nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
514     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
515       PetscInt i   = t.league_rank();
516       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
517       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
518       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);});
519     });
520   } else if (reuse == MAT_INITIAL_MATRIX) {
521     MPI_Comm               comm;
522     MPI_Request            *reqs;
523     PetscMPIInt            tag;
524     PetscInt               Cm;
525 
526     ierr = PetscObjectGetComm((PetscObject)ownerSF,&comm);CHKERRQ(ierr);
527     ierr = PetscCommGetNewTag(comm,&tag);CHKERRQ(ierr);
528 
529     PetscInt niranks,nranks,nroots,nleaves;
530     const PetscMPIInt *iranks,*ranks;
531     const PetscInt *ioffset,*rows,*roffset;  /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
532     ierr = PetscSFSetUp(ownerSF);CHKERRQ(ierr);
533     ierr = PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows);CHKERRQ(ierr); /* recv info: iranks[] will send rows to me */
534     ierr = PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/);CHKERRQ(ierr); /* send info */
535     ierr = PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL);CHKERRQ(ierr);
536     if (nleaves != Am) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am);
537     Cm    = nroots;
538     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
539 
540     /* Tell owners how long each row I will send */
541     PetscInt                *srowlens; /* send buf of row lens */
542     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
543     PetscInt                *rrowlens = rrowlens_h.data();
544 
545     ierr = PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs);CHKERRQ(ierr);
546     for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i];
547     rrowlens[0] = 0;
548     rrowlens++; /* shift the pointer to make the following expression more readable */
549     for (i=0; i<niranks; i++){ierr = MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
550     for (i=0; i<nranks; i++) {ierr = MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i]);CHKERRMPI(ierr);}
551     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
552 
553     /* Owner builds Ci on host by histogramming rrowlens[] */
554     MatRowMapKokkosViewHost Ci_h("i",Cm+1);
555     Kokkos::deep_copy(Ci_h,0); /* Zero Ci */
556     MatRowMapType *Ci_ptr = Ci_h.data();
557 
558     for (i=0; i<nrows; i++) {
559       r = rows[i]; /* local row id of i-th received row */
560      #if defined(PETSC_USE_DEBUG)
561       if (r<0 || r>=Cm) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm);
562      #endif
563       Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */
564     }
565     for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */
566     Cnnz = Ci_ptr[Cm];
567 
568     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
569     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows);
570     PetscInt                *dstrowoffset_hptr = dstrowoffset_h.data();
571     PetscInt                *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
572 
573     ierr = PetscCalloc1(Cm,&currowlens);CHKERRQ(ierr); /* Init with zero, to be added to */
574     for (i=0; i<nrows; i++) { /* for each row I receive */
575       r                    = rows[i]; /* row id in C */
576       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
577       currowlens[r]       += rrowlens[i]; /* accumulate to length of row r in C */
578     }
579     ierr = PetscFree(currowlens);CHKERRQ(ierr);
580 
581     rrowlens--;
582     for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
583     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h);
584     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
585 
586     /* Build the reduceSF, which performs buffer to buffer send/recv */
587     PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */
588     ierr = PetscMalloc2(niranks,&sdisp,nranks,&rdisp);CHKERRQ(ierr);
589     for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
590     for (i=0; i<nranks; i++)  {ierr = MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i]);CHKERRMPI(ierr);}
591     for (i=0; i<niranks; i++) {ierr = MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i]);CHKERRMPI(ierr);}
592     ierr = MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
593 
594     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
595     PetscInt    nroots2 = Cnnz,nleaves2 = Annz;
596     PetscSFNode *iremote;
597     ierr = PetscMalloc1(nleaves2,&iremote);CHKERRQ(ierr); /* no free, since memory will be given to reduceSF */
598     for (i=0; i<nranks; i++) {
599       PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
600       PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
601       PetscInt nz       = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */
602       for (PetscInt k=0; k<nz; k++) {
603         iremote[leafbase+k].rank  = ranks[i];
604         iremote[leafbase+k].index = rootbase + k;
605       }
606     }
607     ierr = PetscSFCreate(comm,&reduceSF);CHKERRQ(ierr);
608     ierr = PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
609     ierr = PetscFree2(sdisp,rdisp);CHKERRQ(ierr);
610 
611     /* Reduce Aa, Ajg to abuf and jbuf */
612 
613     /* If A uses local col ids, convert them to global ones before sending */
614     MatColIdxKokkosView Ajg;
615     if (local) {
616       Ajg = MatColIdxKokkosView("j",Annz);
617       const MatColIdxKokkosView& Aj = akok->j_dual.view_device();
618       Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));});
619     } else {
620       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
621     }
622 
623     MatColIdxKokkosView   jbuf("jbuf",Cnnz);
624     abuf = MatScalarKokkosView("abuf",Cnnz);
625     ierr = PetscSFReduceBegin(reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
626     ierr = PetscSFReduceEnd  (reduceSF,MPIU_INT,   Ajg.data(),           jbuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
627     ierr = PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
628     ierr = PetscSFReduceEnd  (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE);CHKERRMPI(ierr);
629 
630     /* Copy data from abuf, jbuf to Ca, Cj */
631     MatRowMapKokkosView    Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */
632     MatColIdxKokkosView    Cj("j",Cnnz);
633     MatScalarKokkosView    Ca("a",Cnnz);
634 
635     Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
636       PetscInt i   = t.league_rank();
637       PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
638       PetscInt len = srcrowoffset(i+1) - srcrowoffset(i);
639       Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {
640         Ca(dst+k) = abuf(src+k);
641         Cj(dst+k) = jbuf(src+k);
642       });
643     });
644 
645     /* Build C with Ca, Ci, Cj */
646     C    = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj);
647     ierr = PetscFree2(srowlens,reqs);CHKERRQ(ierr);
648   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse);
649   PetscFunctionReturn(0);
650 }
651 
652 /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix
653 
654   Input Parameters:
655 +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
656 .  reuse    - indicate whether the matrix has called this function before
657 .  csrmat   - the KokkosCsrMatrix, of size m,N
658 -  Cdstart  - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first
659               entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
660               entry is 5, then Cdstart[i] = 3.
661 
662   Output Parameters:
663 +  C        - the updated MATMPIAIJKOKKOS matrix
664 -  Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
665 
666   Notes:
667    Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern
668  */
669 static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart)
670 {
671   PetscErrorCode                  ierr;
672   const MatScalarKokkosView&      Ca = csrmat.values;
673   const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map;
674   PetscInt                        m,n,N;
675 
676   PetscFunctionBegin;
677   ierr = MatGetLocalSize(C,&m,&n);CHKERRQ(ierr);
678   ierr = MatGetSize(C,NULL,&N);CHKERRQ(ierr);
679 
680   if (reuse == MAT_REUSE_MATRIX) {
681     Mat_MPIAIJ                  *mpiaij = static_cast<Mat_MPIAIJ*>(C->data);
682     Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr);
683     Mat_SeqAIJKokkos            *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr);
684     const MatScalarKokkosView&  Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device();
685     const MatRowMapKokkosView&  Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device();
686 
687     /* Fill 'a' of Cd and Co on device */
688     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
689       PetscInt i       = t.league_rank(); /* row i */
690       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
691       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
692       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
693       PetscInt cdend   = cdstart + cdlen;
694       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
695       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
696         if (k < cdstart) {  /* k in [0, cdstart) */
697           Coa(Coi(i)+k) = Ca(Ci(i)+k);
698         } else if (k < cdend) { /* k in [cdstart, cdend) */
699           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
700         } else { /* k in [cdend, clen) */
701           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
702         }
703       });
704     });
705 
706     akok->a_dual.modify_device();
707     bkok->a_dual.modify_device();
708   } else if (reuse == MAT_INITIAL_MATRIX) {
709     Mat                         Cd,Co;
710     const MatColIdxKokkosView&  Cj = csrmat.graph.entries;
711     MatRowMapKokkosDualView     Cdi_dual("i",m+1),Coi_dual("i",m+1);
712     MatRowMapKokkosView         Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device();
713     PetscInt                    cstart,cend;
714 
715     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
716        left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend).
717        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
718        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly
719        stores values of cdstart and cdend-cstart (aka Cdi[]) instead.
720      */
721     Cdstart = MatRowMapKokkosView("Cdstart",m);
722     ierr    = PetscLayoutGetRange(C->cmap,&cstart,&cend);CHKERRQ(ierr); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
723 
724     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
725       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
726      */
727     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
728       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
729         PetscInt i = t.league_rank(); /* row i */
730         PetscInt j,first,count,step;
731 
732         if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
733           Cdi(0) = 0;
734           Coi(0) = 0;
735         }
736 
737         /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
738           in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
739         */
740         count = Ci(i+1)-Ci(i);
741         first = Ci(i);
742         while (count > 0) {
743           j    = first;
744           step = count / 2;
745           j   += step;
746           if (Cj(j) < cstart) {
747             first  = ++j;
748             count -= step + 1;
749           } else count = step;
750         }
751         Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
752 
753         /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
754         count = Ci(i+1) - first;
755         while (count > 0) {
756           j    = first;
757           step = count / 2;
758           j   += step;
759           if (Cj(j) < cend) {
760             first  = ++j;
761             count -= step + 1;
762           } else count = step;
763         }
764         Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */
765         Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */
766       });
767     });
768 
769     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
770     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
771       update += Cdi(i);
772       if (final) Cdi(i) = update;
773     });
774     Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) {
775       update += Coi(i);
776       if (final) Coi(i) = update;
777     });
778 
779     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
780        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
781     */
782     Cdi_dual.modify_device();
783     Coi_dual.modify_device();
784     Cdi_dual.sync_host();
785     Coi_dual.sync_host();
786     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
787     PetscInt Co_nnz = Coi_dual.view_host().data()[m];
788 
789     /* With nnz, allocate a, j for Cd and Co */
790     MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz);
791     MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz);
792 
793     /* Fill a, j of Cd and Co on device */
794     MatColIdxKokkosView     Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device();
795     MatScalarKokkosView     Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device();
796 
797     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
798       PetscInt i       = t.league_rank(); /* row i */
799       PetscInt clen    = Ci(i+1) - Ci(i); /* len of row i of C */
800       PetscInt cdlen   = Cdi(i+1) - Cdi(i); /* len of row i of Cd */
801       PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
802       PetscInt cdend   = cdstart + cdlen;
803       /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
804       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
805         if (k < cdstart) { /* k in [0, cdstart) */
806           Coa(Coi(i)+k) = Ca(Ci(i)+k);
807           Coj(Coi(i)+k) = Cj(Ci(i)+k);
808         } else if (k < cdend) { /* k in [cdstart, cdend) */
809           Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k);
810           Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */
811         } else { /* k in [cdend, clen) */
812           Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k);
813           Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k);
814         }
815       });
816     });
817 
818     Cdj_dual.modify_device();
819     Cda_dual.modify_device();
820     Coj_dual.modify_device();
821     Coa_dual.modify_device();
822     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
823     auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual);
824     auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual);
825     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd);CHKERRQ(ierr);
826     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co);CHKERRQ(ierr);
827     ierr = MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co);CHKERRQ(ierr); /* Coj will be converted to local ids within */
828   }
829   PetscFunctionReturn(0);
830 }
831 
832 /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids.
833 
834   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.
835 
836   Input Parameters:
837 +  C        - the MATMPIAIJKOKKOS matrix, of size m,n,M,N
838 .  reuse    - indicate whether the matrix has called this function before
839 .  csrmat   - the KokkosCsrMatrix, of size m,N
840 -  Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first
841               entry of the diag block of C in csrmat's j array.
842 
843   Output Parameters:
844 +  C        - the updated MATMPIAIJKOKKOS matrix
845 -  Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter
846 
847   Notes: the input matrix's col ids and col size will be changed.
848 */
849 static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g)
850 {
851   PetscErrorCode         ierr;
852   Mat_SeqAIJKokkos       *ckok;
853   ISLocalToGlobalMapping l2gmap;
854   const PetscInt         *garray;
855   PetscInt               sz;
856 
857   PetscFunctionBegin;
858   /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */
859   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap);CHKERRQ(ierr);
860   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
861   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
862   ckok->j_dual.sync_device();
863   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
864 
865   /* Build l2g -- the local to global mapping of C's cols */
866   ierr = ISLocalToGlobalMappingGetIndices(l2gmap,&garray);CHKERRQ(ierr);
867   ierr = ISLocalToGlobalMappingGetSize(l2gmap,&sz);CHKERRQ(ierr);
868   if (C->cmap->n != sz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz);
869 
870   ConstMatColIdxKokkosViewHost tmp(garray,sz);
871   l2g = MatColIdxKokkosView("l2g",sz);
872   Kokkos::deep_copy(l2g,tmp);
873 
874   ierr = ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray);CHKERRQ(ierr);
875   ierr = ISLocalToGlobalMappingDestroy(&l2gmap);CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
879 /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
880 
881   Input Parameters:
882 +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
883 .  A        - an MPIAIJKOKKOS matrix
884 .  B        - an MPIAIJKOKKOS matrix
885 -  mm       - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations.
886 
887   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
888 */
889 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm)
890 {
891   PetscErrorCode              ierr;
892   Mat_MPIAIJ                  *a = static_cast<Mat_MPIAIJ*>(A->data);
893   Mat                         Ad = a->A,Ao = a->B; /* diag and offdiag of A */
894   IS                          glob = NULL;
895   const PetscInt              *garray;
896   PetscInt                    N = B->cmap->N,sz;
897   ConstMatColIdxKokkosView    l2g1; /* two temp maps mapping local col ids to global ones */
898   MatColIdxKokkosView         l2g2;
899   Mat                         C1,C2; /* intermediate matrices */
900 
901   PetscFunctionBegin;
902   /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
903   ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local);CHKERRQ(ierr);
904   ierr = MatProductCreate(Ad,mm->B_local,NULL,&C1);CHKERRQ(ierr);
905   ierr = MatProductSetType(C1,MATPRODUCT_AB);CHKERRQ(ierr);
906   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
907   C1->product->api_user = product->api_user;
908   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
909   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
910   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
911 
912   ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
913   ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
914   const auto& tmp  = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */
915   l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
916   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global);
917 
918   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
919   ierr = MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,
920                               mm->abuf,mm->rows,mm->rowoffset,mm->B_other);CHKERRQ(ierr);
921 
922   /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */
923   ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2);CHKERRQ(ierr);
924   ierr = MatProductCreate(Ao,mm->B_other,NULL,&C2);CHKERRQ(ierr);
925   ierr = MatProductSetType(C2,MATPRODUCT_AB);CHKERRQ(ierr);
926   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
927   C2->product->api_user = product->api_user;
928   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
929   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
930   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
931   ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global);
932 
933   /* C = C1 + C2.  We actually use their global col ids versions in adding */
934   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
935   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
936   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
937   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
938 
939   mm->C1 = C1;
940   mm->C2 = C2;
941   ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
942   ierr = ISDestroy(&glob);CHKERRQ(ierr);
943   PetscFunctionReturn(0);
944 }
945 
946 /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
947 
948   Input Parameters:
949 +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
950 .  A        - an MPIAIJKOKKOS matrix
951 .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
952 .  localB   - Does B use local col ids? If false, then B is already in global col ids.
953 .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
954 .  l2g      - If localB, then l2g maps B's local col ids to global ones.
955 -  mm       - a struct used to stash intermediate data in AtB
956 
957   Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
958 */
959 static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm)
960 {
961   PetscErrorCode         ierr;
962   Mat_MPIAIJ             *a = static_cast<Mat_MPIAIJ*>(A->data);
963   Mat                    Ad = a->A,Ao = a->B; /* diag and offdiag of A */
964   Mat                    C1,C2; /* intermediate matrices */
965 
966   PetscFunctionBegin;
967   /* C1 = Ad^t * B */
968   ierr = MatProductCreate(Ad,B,NULL,&C1);CHKERRQ(ierr);
969   ierr = MatProductSetType(C1,MATPRODUCT_AtB);CHKERRQ(ierr);
970   ierr = MatProductSetFill(C1,product->fill);CHKERRQ(ierr);
971   C1->product->api_user = product->api_user;
972   ierr = MatProductSetFromOptions(C1);CHKERRQ(ierr);
973   if (!C1->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]);
974   ierr = (*C1->ops->productsymbolic)(C1);CHKERRQ(ierr);
975 
976   if (localB) {ierr = MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global);}
977   else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
978 
979   /* C2 = Ao^t * B */
980   ierr = MatProductCreate(Ao,B,NULL,&C2);CHKERRQ(ierr);
981   ierr = MatProductSetType(C2,MATPRODUCT_AtB);CHKERRQ(ierr);
982   ierr = MatProductSetFill(C2,product->fill);CHKERRQ(ierr);
983   C2->product->api_user = product->api_user;
984   ierr = MatProductSetFromOptions(C2);CHKERRQ(ierr);
985   if (!C2->ops->productsymbolic) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]);
986   ierr = (*C2->ops->productsymbolic)(C2);CHKERRQ(ierr);
987 
988   ierr = MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,
989                                mm->srcrowoffset,mm->dstrowoffset,mm->C2_global);CHKERRQ(ierr);
990 
991   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
992   KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global);
993   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
994   KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global);
995   mm->C1 = C1;
996   mm->C2 = C2;
997   PetscFunctionReturn(0);
998 }
999 
1000 PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
1001 {
1002   PetscErrorCode                ierr;
1003   Mat_Product                   *product = C->product;
1004   MatProductType                ptype;
1005   MatProductData_MPIAIJKokkos   *mmdata;
1006   MatMatStruct                  *mm = NULL;
1007   MatMatStruct_AB               *ab;
1008   MatMatStruct_AtB              *atb;
1009   Mat                           A,B,Ad,Ao,Bd,Bo;
1010   const MatScalarType           one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */
1011 
1012   PetscFunctionBegin;
1013   MatCheckProduct(C,1);
1014   mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data);
1015   ptype  = product->type;
1016   A      = product->A;
1017   B      = product->B;
1018   Ad     = static_cast<Mat_MPIAIJ*>(A->data)->A;
1019   Ao     = static_cast<Mat_MPIAIJ*>(A->data)->B;
1020   Bd     = static_cast<Mat_MPIAIJ*>(B->data)->A;
1021   Bo     = static_cast<Mat_MPIAIJ*>(B->data)->B;
1022 
1023   if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
1024     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
1025     ab  = mmdata->mmAB;
1026     atb = mmdata->mmAtB;
1027     if (ab) {
1028       static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE;
1029       static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE;
1030     }
1031     if (atb) {
1032       static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1033       static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1034     }
1035     PetscFunctionReturn(0);
1036   }
1037 
1038   if (ptype == MATPRODUCT_AB) {
1039     ab   = mmdata->mmAB;
1040     /* C1 = Ad * B_local */
1041     if (!ab->C1->ops->productnumeric || !ab->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB");
1042     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1043     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed");
1044     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1045     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1046     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1047                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1048     /* C2 = Ao * B_other */
1049     if (ab->C2->product->B != ab->B_other) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed");
1050     if (ab->C1->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1051     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr);
1052     /* C = C1_global + C2_global */
1053     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1054     mm = static_cast<MatMatStruct*>(ab);
1055   } else if (ptype == MATPRODUCT_AtB) {
1056     atb  = mmdata->mmAtB;
1057     if (!atb->C1->ops->productnumeric || !atb->C2->ops->productnumeric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB");
1058     /* C1 = Ad^t * B_local */
1059     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local);CHKERRQ(ierr);
1060     if (atb->C1->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed");
1061     if (atb->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1062     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1063 
1064     /* C2 = Ao^t * B_local */
1065     if (atb->C2->product->B != atb->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed");
1066     if (atb->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1067     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1068     /* Form C2_global */
1069     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1070                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1071     /* C = C1_global + C2_global */
1072     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1073     mm = static_cast<MatMatStruct*>(atb);
1074   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1075     ab   = mmdata->mmAB;
1076     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local);CHKERRQ(ierr);
1077 
1078     /* ab->C1 = Ad * B_local */
1079     if (ab->C1->product->B != ab->B_local) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed");
1080     if (ab->C1->product->A != Ad) {ierr = MatProductReplaceMats(Ad,NULL,NULL,ab->C1);CHKERRQ(ierr);}
1081     ierr = (*ab->C1->ops->productnumeric)(ab->C1);CHKERRQ(ierr);
1082     ierr = MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf,
1083                                 ab->abuf,ab->rows,ab->rowoffset,ab->B_other);CHKERRQ(ierr);
1084     /* ab->C2 = Ao * B_other */
1085     if (ab->C2->product->A != Ao) {ierr = MatProductReplaceMats(Ao,NULL,NULL,ab->C2);CHKERRQ(ierr);}
1086     ierr = (*ab->C2->ops->productnumeric)(ab->C2);CHKERRQ(ierr); /* C2 = Ao * B_other */
1087     KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global);
1088 
1089     /* atb->C1 = Bd^t * ab->C_petsc */
1090     atb  = mmdata->mmAtB;
1091     if (atb->C1->product->B != ab->C_petsc) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed");
1092     if (atb->C1->product->A != Bd) {ierr = MatProductReplaceMats(Bd,NULL,NULL,atb->C1);CHKERRQ(ierr);}
1093     ierr = (*atb->C1->ops->productnumeric)(atb->C1);CHKERRQ(ierr);
1094     /* atb->C2 = Bo^t * ab->C_petsc */
1095     if (atb->C2->product->A != Bo) {ierr = MatProductReplaceMats(Bo,NULL,NULL,atb->C2);CHKERRQ(ierr);}
1096     ierr = (*atb->C2->ops->productnumeric)(atb->C2);CHKERRQ(ierr);
1097     ierr = MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf,
1098                                  atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global);CHKERRQ(ierr);
1099     KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global);
1100     mm = static_cast<MatMatStruct*>(atb);
1101   }
1102   /* Split C_global to form C */
1103   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1108 {
1109   PetscErrorCode              ierr;
1110   Mat                         A,B;
1111   Mat_Product                 *product = C->product;
1112   MatProductType              ptype;
1113   MatProductData_MPIAIJKokkos *mmdata;
1114   MatMatStruct                *mm = NULL;
1115   IS                          glob = NULL;
1116   const PetscInt              *garray;
1117   PetscInt                    m,n,M,N,sz;
1118   ConstMatColIdxKokkosView    l2g; /* map local col ids to global ones */
1119 
1120   PetscFunctionBegin;
1121   MatCheckProduct(C,1);
1122   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1123   ptype = product->type;
1124   A     = product->A;
1125   B     = product->B;
1126 
1127   switch (ptype) {
1128     case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break;
1129     case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break;
1130     case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */
1131     default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
1132   }
1133 
1134   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
1135   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
1136   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
1137   ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
1138 
1139   mmdata           = new MatProductData_MPIAIJKokkos();
1140   mmdata->reusesym = product->api_user;
1141 
1142   if (ptype == MATPRODUCT_AB) {
1143     mmdata->mmAB = new MatMatStruct_AB();
1144     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB);CHKERRQ(ierr);
1145     mm   = static_cast<MatMatStruct*>(mmdata->mmAB);
1146   } else if (ptype == MATPRODUCT_AtB) {
1147     mmdata->mmAtB = new MatMatStruct_AtB();
1148     auto atb      = mmdata->mmAtB;
1149     ierr = MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local);CHKERRQ(ierr);
1150     ierr = ISGetIndices(glob,&garray);CHKERRQ(ierr);
1151     ierr = ISGetSize(glob,&sz);CHKERRQ(ierr);
1152     l2g  = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz));
1153     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb);CHKERRQ(ierr);
1154     ierr = ISRestoreIndices(glob,&garray);CHKERRQ(ierr);
1155     ierr = ISDestroy(&glob);CHKERRQ(ierr);
1156     mm   = static_cast<MatMatStruct*>(atb);
1157   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1158     mmdata->mmAB  = new MatMatStruct_AB(); /* tmp=A*B */
1159     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1160     auto ab       = mmdata->mmAB;
1161     auto atb      = mmdata->mmAtB;
1162     ierr = MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab);CHKERRQ(ierr);
1163     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1164     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc);CHKERRQ(ierr);
1165     ierr = MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb);CHKERRQ(ierr);
1166     mm   = static_cast<MatMatStruct*>(atb);
1167   }
1168   /* Split the C_global into petsc A, B format */
1169   ierr = MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart);CHKERRQ(ierr);
1170   C->product->data        = mmdata;
1171   C->product->destroy     = MatProductDataDestroy_MPIAIJKokkos;
1172   C->ops->productnumeric  = MatProductNumeric_MPIAIJKokkos;
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1177 {
1178   PetscErrorCode ierr;
1179   Mat_Product    *product = mat->product;
1180   PetscBool      match = PETSC_FALSE;
1181   PetscBool      usecpu = PETSC_FALSE;
1182 
1183   PetscFunctionBegin;
1184   MatCheckProduct(mat,1);
1185   if (!product->A->boundtocpu && !product->B->boundtocpu) {
1186     ierr = PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match);CHKERRQ(ierr);
1187   }
1188   if (match) { /* we can always fallback to the CPU if requested */
1189     switch (product->type) {
1190     case MATPRODUCT_AB:
1191       if (product->api_user) {
1192         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
1193         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1194         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1195       } else {
1196         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
1197         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1198         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1199       }
1200       break;
1201     case MATPRODUCT_AtB:
1202       if (product->api_user) {
1203         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
1204         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1205         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1206       } else {
1207         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
1208         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1209         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1210       }
1211       break;
1212     case MATPRODUCT_PtAP:
1213       if (product->api_user) {
1214         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1215         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1216         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1217       } else {
1218         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
1219         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
1220         ierr = PetscOptionsEnd();CHKERRQ(ierr);
1221       }
1222       break;
1223     default:
1224       break;
1225     }
1226     match = (PetscBool)!usecpu;
1227   }
1228   if (match) {
1229     switch (product->type) {
1230     case MATPRODUCT_AB:
1231     case MATPRODUCT_AtB:
1232     case MATPRODUCT_PtAP:
1233       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1234       break;
1235     default:
1236       break;
1237     }
1238   }
1239   /* fallback to MPIAIJ ops */
1240   if (!mat->ops->productsymbolic) {
1241     ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
1242   }
1243   PetscFunctionReturn(0);
1244 }
1245 
1246 /* std::upper_bound(): Given a sorted array, return index of the first element in range [first,last) whose value
1247    is greater than value, or last if there is no such element.
1248 */
1249 PETSC_STATIC_INLINE PetscErrorCode PetscSortedIntUpperBound(PetscInt *array,PetscInt first,PetscInt last,PetscInt value,PetscInt *upper)
1250 {
1251   PetscInt  it,step,count = last - first;
1252 
1253   PetscFunctionBegin;
1254   while (count > 0) {
1255     it   = first;
1256     step = count / 2;
1257     it  += step;
1258     if (!(value < array[it])) {
1259       first  = ++it;
1260       count -= step + 1;
1261     } else count = step;
1262   }
1263   *upper = first;
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 /* Merge two sets of sorted nonzero entries and return a CSR for the merged (sequential) matrix
1268 
1269   Input Parameters:
1270 
1271     j1,rowBegin1,rowEnd1,perm1,jmap1: describe the first set of nonzeros (Set1)
1272     j2,rowBegin2,rowEnd2,perm2,jmap2: describe the second set of nonzeros (Set2)
1273 
1274     mat: both sets' entries are on m rows, where m is the number of local rows of the matrix mat
1275 
1276     For Set1, j1[] contains column indices of the nonzeros.
1277     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
1278     respectively (note rowEnd1[k] is not necessarily equal to rwoBegin1[k+1]). Indices in this range of j1[] are sorted,
1279     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
1280 
1281     Similar for Set2.
1282 
1283     This routine merges the two sets of nonzeros row by row and removes repeats.
1284 
1285   Output Parameters: (memories are allocated by the caller)
1286 
1287     i[],j[]: the CSR of the merged matrix, which has m rows.
1288     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
1289     imap2[]: similar to imap1[], but for Set2.
1290     Note we order nonzeros row-by-row and from left to right.
1291 */
1292 static PetscErrorCode MatMergeEntries_Internal(Mat mat,const PetscInt *j1,const PetscInt *j2,const PetscInt *rowBegin1,const PetscInt *rowEnd1,
1293   const PetscInt *rowBegin2,const PetscInt *rowEnd2,const MatRowMapKokkosViewHost& jmap1_h,const MatRowMapKokkosViewHost& jmap2_h,
1294   MatRowMapKokkosViewHost& imap1_h,MatRowMapKokkosViewHost& imap2_h,PetscInt *i,PetscInt *j)
1295 {
1296   PetscErrorCode ierr;
1297   PetscInt       r,m,t,t1,t2,b1,e1,b2,e2;
1298   PetscInt       *jmap1 = jmap1_h.data(),*jmap2 = jmap2_h.data(),*imap1 = imap1_h.data(),*imap2 = imap2_h.data();
1299 
1300   PetscFunctionBegin;
1301   ierr = MatGetLocalSize(mat,&m,NULL);CHKERRQ(ierr);
1302   t1   = t2 = t = 0; /* Count unique nonzeros of in Set1, Set1 and the merged respectively */
1303   i[0] = 0;
1304   for (r=0; r<m; r++) { /* Do row by row merging */
1305     b1   = rowBegin1[r];
1306     e1   = rowEnd1[r];
1307     b2   = rowBegin2[r];
1308     e2   = rowEnd2[r];
1309     while (b1 < e1 && b2 < e2) {
1310       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
1311         j[t]      = j1[b1];
1312         imap1[t1] = t;
1313         imap2[t2] = t;
1314         b1       += jmap1[t1+1] - jmap1[t1]; /* Jump to next unique local nonzero */
1315         b2       += jmap2[t2+1] - jmap2[t2]; /* Jump to next unique remote nonzero */
1316         t1++; t2++; t++;
1317       } else if (j1[b1] < j2[b2]) {
1318         j[t]      = j1[b1];
1319         imap1[t1] = t;
1320         b1       += jmap1[t1+1] - jmap1[t1];
1321         t1++; t++;
1322       } else {
1323         j[t]      = j2[b2];
1324         imap2[t2] = t;
1325         b2       += jmap2[t2+1] - jmap2[t2];
1326         t2++; t++;
1327       }
1328     }
1329     /* Merge the remaining in either j1[] or j2[] */
1330     while (b1 < e1) {
1331       j[t]      = j1[b1];
1332       imap1[t1] = t;
1333       b1       += jmap1[t1+1] - jmap1[t1];
1334       t1++; t++;
1335     }
1336     while (b2 < e2) {
1337       j[t]      = j2[b2];
1338       imap2[t2] = t;
1339       b2       += jmap2[t2+1] - jmap2[t2];
1340       t2++; t++;
1341     }
1342     i[r+1] = t;
1343   }
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 /* Split a set/group of local entries into two subsets: those in the diagonal block and those in the off-diagonal block
1348 
1349   Input Parameters:
1350     mat: an MPI matrix that provides row and column layout information for splitting. Let's assume its number of local rows is m.
1351     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
1352       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
1353 
1354       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
1355       i[] might contain negative indices at the beginning, which says the corresponding entries should be ignored in the splitting.
1356 
1357   Output Parameters:
1358     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
1359     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
1360       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries in row r of the diagonal block,
1361       and [rowMid[r],rowEnd[r]) point to begin/end entries in row r of the off-diagonal block.
1362 
1363     Aperm_h,Ajmap_h: They are Kokkos views on host. This routine will resize and fill them with proper values. Let's say Aperm = Aperm_h.data(),
1364       and Ajmap = Ajmap_h.data(). Aperm[] stores values from perm[] for entries in the diagonal block. Hence length of Aperm[] is the number
1365       of entries in the diagonal block, though those entries might have repeats (i.e., same 'i,j' pair).
1366       Ajmap[] stores the number of repeats of each unique nonzero in the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t] is the number of
1367       repeats for the t-th unique nonzero in the diagonal block. Ajmap[0] is always 0.
1368       Length of Aperm_h is the number of nonzeros in the diagonal block.
1369       Length of Ajmap_h is the number of unique nonzeros in the diagonal block + 1.
1370 
1371     Bperm_h and Bjmap_h are similar to Aperm_h and Ajmap_h, respectively, but for the off-diagonal block.
1372 */
1373 
1374 static PetscErrorCode MatSplitEntries_Internal(Mat mat,PetscInt n,const PetscInt i[],
1375   PetscInt j[],PetscInt perm[],PetscInt rowBegin[],PetscInt rowMid[],PetscInt rowEnd[],
1376   MatRowMapKokkosViewHost& Aperm_h,MatRowMapKokkosViewHost& Ajmap_h,MatRowMapKokkosViewHost& Bperm_h,MatRowMapKokkosViewHost& Bjmap_h)
1377 {
1378   PetscErrorCode    ierr;
1379   PetscInt          cstart,cend,rstart,rend,mid;
1380   PetscInt          Atot=0,Btot=0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
1381   PetscInt          Annz=0,Bnnz=0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
1382   PetscInt          k,m,p,q,r,s,row,col;
1383   PetscInt          *Aperm,*Bperm,*Ajmap,*Bjmap;
1384 
1385   PetscFunctionBegin;
1386   ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
1387   ierr = PetscLayoutGetRange(mat->cmap,&cstart,&cend);CHKERRQ(ierr);
1388   m    = rend - rstart;
1389 
1390   for (k=0; k<n; k++) {if (i[k]>=0) break;} /* Skip negative rows */
1391 
1392   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
1393      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
1394   */
1395   while (k<n) {
1396     row = i[k];
1397     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
1398     for (s=k; s<n; s++) if (i[s] != row) break;
1399     for (p=k; p<s; p++) {
1400       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_MAX_INT; /* Shift diag columns to range of [-PETSC_MAX_INT, -1]  */
1401      #if defined(PETSC_USE_DEBUG)
1402       else if (j[p] < 0 || j[p] > mat->cmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column index %" PetscInt_FMT " is out of range",j[p]);
1403      #endif
1404     }
1405     ierr = PetscSortIntWithArray(s-k,j+k,perm+k);CHKERRQ(ierr);
1406     ierr = PetscSortedIntUpperBound(j,k,s,-1,&mid);CHKERRQ(ierr); /* Seperate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
1407     rowBegin[row-rstart] = k;
1408     rowMid[row-rstart]   = mid;
1409     rowEnd[row-rstart]   = s;
1410 
1411     /* Count nonzeros of this diag/offdiag row, which might have repeats */
1412     Atot += mid - k;
1413     Btot += s - mid;
1414 
1415     /* Count unique nonzeros of this diag/offdiag row */
1416     for (p=k; p<mid;) {
1417       col = j[p];
1418       do {j[p] += PETSC_MAX_INT; p++;} while (p<mid && j[p] == col); /* Revert the modified diagonal indices */
1419       Annz++;
1420     }
1421 
1422     for (p=mid; p<s;) {
1423       col = j[p];
1424       do {p++;} while (p<s && j[p] == col);
1425       Bnnz++;
1426     }
1427     k = s;
1428   }
1429 
1430   /* Resize views according to Atot, Btot, Annz, Bnnz */
1431   Kokkos::resize(Aperm_h,Atot);
1432   Kokkos::resize(Ajmap_h,Annz+1);
1433   Kokkos::resize(Bperm_h,Btot);
1434   Kokkos::resize(Bjmap_h,Bnnz+1);
1435   Aperm    = Aperm_h.data();
1436   Bperm    = Bperm_h.data();
1437   Ajmap    = Ajmap_h.data();
1438   Bjmap    = Bjmap_h.data();
1439   Ajmap[0] = 0;
1440   Bjmap[0] = 0;
1441 
1442   /* Re-scan indices and copy diag/offdiag permuation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
1443   Atot = Btot = Annz = Bnnz = 0;
1444   for (r=0; r<m; r++) {
1445     k     = rowBegin[r];
1446     mid   = rowMid[r];
1447     s     = rowEnd[r];
1448     ierr  = PetscArraycpy(Aperm+Atot,perm+k,  mid-k);CHKERRQ(ierr);
1449     ierr  = PetscArraycpy(Bperm+Btot,perm+mid,s-mid);CHKERRQ(ierr);
1450     Atot += mid - k;
1451     Btot += s - mid;
1452 
1453     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
1454     for (p=k; p<mid;) {
1455       col = j[p];
1456       q   = p;
1457       do {p++;} while (p<mid && j[p] == col);
1458       Ajmap[Annz+1] = Ajmap[Annz] + (p - q);
1459       Annz++;
1460     }
1461 
1462     for (p=mid; p<s;) {
1463       col = j[p];
1464       q   = p;
1465       do {p++;} while (p<s && j[p] == col);
1466       Bjmap[Bnnz+1] = Bjmap[Bnnz] + (p - q);
1467       Bnnz++;
1468     }
1469   }
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscInt coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1474 {
1475   PetscErrorCode            ierr;
1476   MPI_Comm                  comm;
1477   PetscMPIInt               rank,size;
1478   PetscInt                  m,n,M,N,k,p,q,rstart,rend,cstart,cend,rem;
1479 
1480   PetscFunctionBegin;
1481   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1482   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1483   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
1484 
1485   ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
1486   ierr = PetscLayoutGetRange(mat->cmap,&cstart,&cend);CHKERRQ(ierr);
1487   ierr = MatGetLocalSize(mat,&m,&n);CHKERRQ(ierr);
1488   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1489 
1490   /* ---------------------------------------------------------------------------*/
1491   /* Sort (i,j) by row along with a permuation array, so that the to-be-ignored */
1492   /* entries come first, then local rows, then remote rows.                     */
1493   /* ---------------------------------------------------------------------------*/
1494   PetscInt  n1 = coo_n,*i1,*j1,*perm1; /* Copies of input COOs along with a permutation array */
1495   ierr = PetscMalloc3(n1,&i1,n1,&j1,n1,&perm1);CHKERRQ(ierr);
1496   ierr = PetscArraycpy(i1,coo_i,n1);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1497   ierr = PetscArraycpy(j1,coo_j,n1);CHKERRQ(ierr);
1498   for (k=0; k<n1; k++) perm1[k] = k;
1499 
1500   /* Manipulate indices so that entries with negative row or col indices will have smallest
1501      row indices, local entries will have greater but negative row indices, and remote entries
1502      will have positive row indices.
1503   */
1504   for (k=0; k<n1; k++) {
1505     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_MIN_INT; /* e.g., -2^31, minimal to move them ahead */
1506     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_MAX_INT; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_MAX_INT, -1] */
1507   }
1508 
1509   /* Sort by row; after that, [0,k) have ignored entires, [k,rem) have local rows and [rem,n1) have remote rows */
1510   ierr = PetscSortIntWithArrayPair(n1,i1,j1,perm1);CHKERRQ(ierr);
1511   for (k=0; k<n1; k++) {if (i1[k] > PETSC_MIN_INT) break;} /* Advance k to the first entry we need to take care of */
1512   ierr = PetscSortedIntUpperBound(i1,k,n1,rend-1-PETSC_MAX_INT,&rem);CHKERRQ(ierr); /* rem is upper bound of the last local row */
1513   for (; k<rem; k++) i1[k] += PETSC_MAX_INT; /* Revert row indices of local rows*/
1514 
1515   /* ---------------------------------------------------------------------------*/
1516   /*           Split local rows into diag/offdiag portions                      */
1517   /* ---------------------------------------------------------------------------*/
1518   PetscInt                  *rowBegin1,*rowMid1,*rowEnd1;
1519   MatRowMapKokkosViewHost   Ajmap1_h,Aperm1_h,Bjmap1_h,Bperm1_h,Cperm1_h("Cperm1_h",n1-rem);
1520 
1521   ierr = PetscCalloc3(m,&rowBegin1,m,&rowMid1,m,&rowEnd1);CHKERRQ(ierr);
1522   ierr = MatSplitEntries_Internal(mat,rem,i1,j1,perm1,rowBegin1,rowMid1,rowEnd1,Aperm1_h,Ajmap1_h,Bperm1_h,Bjmap1_h);CHKERRQ(ierr);
1523 
1524   /* ---------------------------------------------------------------------------*/
1525   /*           Send remote rows to their owner                                  */
1526   /* ---------------------------------------------------------------------------*/
1527   /* Find which rows should be sent to which remote ranks*/
1528   PetscInt       nsend = 0;
1529   PetscMPIInt    *sendto; /* Of length nsend, storing remote ranks */
1530   PetscInt       *nentries; /* Of length nsend, storing number of entries to be sent to each remote rank */
1531   const PetscInt *ranges;
1532   PetscInt       maxNsend = size >= 128? 128 : size; /* Assume max 128 neighbors; realloc when needed */
1533 
1534   ierr = PetscLayoutGetRanges(mat->rmap,&ranges);CHKERRQ(ierr);
1535   ierr = PetscMalloc2(maxNsend,&sendto,maxNsend,&nentries);CHKERRQ(ierr);
1536   for (k=rem; k<n1;) {
1537     PetscMPIInt  owner;
1538     PetscInt     firstRow,lastRow;
1539     /* Locate a row range */
1540     firstRow = i1[k]; /* first row of this owner */
1541     ierr     = PetscLayoutFindOwner(mat->rmap,firstRow,&owner);CHKERRQ(ierr);
1542     lastRow  = ranges[owner+1]-1; /* last row of this owner */
1543 
1544     /* Find the first index 'p' in [k,n) with i[p] belonging to next owner */
1545     ierr     = PetscSortedIntUpperBound(i1,k,n1,lastRow,&p);CHKERRQ(ierr);
1546 
1547     /* All entries in [k,p) belong to this remote owner */
1548     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
1549       PetscMPIInt *sendto2;
1550       PetscInt    *nentries2;
1551       PetscInt    maxNsend2 = (maxNsend <= size/2) ? maxNsend*2 : size;
1552       ierr = PetscMalloc2(maxNsend2,&sendto2,maxNsend2,&nentries2);CHKERRQ(ierr);
1553       ierr = PetscArraycpy(sendto2,sendto,maxNsend);CHKERRQ(ierr);
1554       ierr = PetscArraycpy(nentries2,nentries2,maxNsend+1);CHKERRQ(ierr);
1555       ierr = PetscFree2(sendto,nentries2);CHKERRQ(ierr);
1556       sendto      = sendto2;
1557       nentries    = nentries2;
1558       maxNsend    = maxNsend2;
1559     }
1560     sendto[nsend]   = owner;
1561     nentries[nsend] = p - k;
1562     nsend++;
1563     k = p;
1564   }
1565 
1566   /* Build 1st SF to know offsets on remote to send data */
1567   PetscSF     sf1;
1568   PetscInt    nroots = 1,nroots2 = 0;
1569   PetscInt    nleaves = nsend,nleaves2 = 0;
1570   PetscInt    *offsets;
1571   PetscSFNode *iremote;
1572 
1573   ierr = PetscSFCreate(comm,&sf1);CHKERRQ(ierr);
1574   ierr = PetscMalloc1(nsend,&iremote);CHKERRQ(ierr);
1575   ierr = PetscMalloc1(nsend,&offsets);CHKERRQ(ierr);
1576   for (k=0; k<nsend; k++) {
1577     iremote[k].rank  = sendto[k];
1578     iremote[k].index = 0;
1579     nleaves2        += nentries[k];
1580   }
1581   ierr = PetscSFSetGraph(sf1,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1582   ierr = PetscSFFetchAndOpWithMemTypeBegin(sf1,MPIU_INT,PETSC_MEMTYPE_HOST,&nroots2/*rootdata*/,PETSC_MEMTYPE_HOST,nentries/*leafdata*/,PETSC_MEMTYPE_HOST,offsets/*leafupdate*/,MPI_SUM);CHKERRQ(ierr);
1583   ierr = PetscSFFetchAndOpEnd(sf1,MPIU_INT,&nroots2,nentries,offsets,MPI_SUM);CHKERRQ(ierr);
1584   ierr = PetscSFDestroy(&sf1);CHKERRQ(ierr);
1585 
1586   /* Build 2nd SF to send remote COOs to their owner */
1587   PetscSF sf2;
1588   nroots  = nroots2;
1589   nleaves = nleaves2;
1590   ierr    = PetscSFCreate(comm,&sf2);CHKERRQ(ierr);
1591   ierr    = PetscSFSetFromOptions(sf2);CHKERRQ(ierr);
1592   ierr    = PetscMalloc1(nleaves,&iremote);CHKERRQ(ierr);
1593   p       = 0;
1594   for (k=0; k<nsend; k++) {
1595     for (q=0; q<nentries[k]; q++,p++) {
1596       iremote[p].rank  = sendto[k];
1597       iremote[p].index = offsets[k] + q;
1598     }
1599   }
1600   ierr = PetscSFSetGraph(sf2,nroots,nleaves,NULL,PETSC_USE_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1601 
1602   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permuation which will be used to fill leafdata */
1603   ierr = PetscArraycpy(Cperm1_h.data(),perm1+rem,n1-rem);CHKERRQ(ierr);
1604 
1605   /* Send the remote COOs to their owner */
1606   PetscInt n2 = nroots,*i2,*j2,*perm2; /* Buffers for received COOs from other ranks, along with a permutation array */
1607   ierr = PetscMalloc3(n2,&i2,n2,&j2,n2,&perm2);CHKERRQ(ierr);
1608   ierr = PetscSFReduceWithMemTypeBegin(sf2,MPIU_INT,PETSC_MEMTYPE_HOST,i1+rem,PETSC_MEMTYPE_HOST,i2,MPI_REPLACE);CHKERRQ(ierr);
1609   ierr = PetscSFReduceEnd(sf2,MPIU_INT,i1+rem,i2,MPI_REPLACE);CHKERRQ(ierr);
1610   ierr = PetscSFReduceWithMemTypeBegin(sf2,MPIU_INT,PETSC_MEMTYPE_HOST,j1+rem,PETSC_MEMTYPE_HOST,j2,MPI_REPLACE);CHKERRQ(ierr);
1611   ierr = PetscSFReduceEnd(sf2,MPIU_INT,j1+rem,j2,MPI_REPLACE);CHKERRQ(ierr);
1612 
1613   ierr = PetscFree(offsets);CHKERRQ(ierr);
1614   ierr = PetscFree2(sendto,nentries);CHKERRQ(ierr);
1615 
1616   /* ---------------------------------------------------------------*/
1617   /* Sort received COOs by row along with the permutation array     */
1618   /* ---------------------------------------------------------------*/
1619   for (k=0; k<n2; k++) perm2[k] = k;
1620   ierr = PetscSortIntWithArrayPair(n2,i2,j2,perm2);CHKERRQ(ierr);
1621 
1622   /* ---------------------------------------------------------------*/
1623   /* Split received COOs into diag/offdiag portions                 */
1624   /* ---------------------------------------------------------------*/
1625   PetscInt                  *rowBegin2,*rowMid2,*rowEnd2;
1626   MatRowMapKokkosViewHost   Ajmap2_h,Aperm2_h,Bjmap2_h,Bperm2_h;
1627 
1628   ierr = PetscCalloc3(m,&rowBegin2,m,&rowMid2,m,&rowEnd2);CHKERRQ(ierr);
1629   ierr = MatSplitEntries_Internal(mat,n2,i2,j2,perm2,rowBegin2,rowMid2,rowEnd2,Aperm2_h,Ajmap2_h,Bperm2_h,Bjmap2_h);CHKERRQ(ierr);
1630 
1631   /* --------------------------------------------------------------------------*/
1632   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
1633   /* --------------------------------------------------------------------------*/
1634   PetscInt Annz1,Annz2,Bnnz1,Bnnz2;
1635   PetscInt *Ai,*Aj,*Bi,*Bj;
1636 
1637   Annz1 = Ajmap1_h.extent(0)-1; /* Number of unique local nonzeros in the diagonal block */
1638   Annz2 = Ajmap2_h.extent(0)-1; /* Number of unique received nonzeros in the diagonal block */
1639   Bnnz1 = Bjmap1_h.extent(0)-1; /* Similar, but for the off-diagonal block */
1640   Bnnz2 = Bjmap2_h.extent(0)-1;
1641   ierr  = PetscMalloc1(m+1,&Ai);CHKERRQ(ierr);
1642   ierr  = PetscMalloc1(m+1,&Bi);CHKERRQ(ierr);
1643   ierr  = PetscMalloc1(Annz1+Annz2,&Aj);CHKERRQ(ierr); /* Since local and remote entries might have dups, we might allocate excess memory */
1644   ierr  = PetscMalloc1(Bnnz1+Bnnz2,&Bj);CHKERRQ(ierr);
1645 
1646   MatRowMapKokkosViewHost Aimap1_h("Aimpa1",Annz1),Aimap2_h("Aimpa2",Annz2),Bimap1_h("Bimap1",Bnnz1),Bimap2_h("Bimap2",Bnnz2);
1647   ierr = MatMergeEntries_Internal(mat,j1,j2,rowBegin1,rowMid1,rowBegin2,rowMid2,Ajmap1_h,Ajmap2_h,Aimap1_h,Aimap2_h,Ai,Aj);CHKERRQ(ierr);
1648   ierr = MatMergeEntries_Internal(mat,j1,j2,rowMid1,  rowEnd1,rowMid2,  rowEnd2,Bjmap1_h,Bjmap2_h,Bimap1_h,Bimap2_h,Bi,Bj);CHKERRQ(ierr);
1649   ierr = PetscFree3(rowBegin1,rowMid1,rowEnd1);CHKERRQ(ierr);
1650   ierr = PetscFree3(rowBegin2,rowMid2,rowEnd2);CHKERRQ(ierr);
1651   ierr = PetscFree3(i1,j1,perm1);CHKERRQ(ierr);
1652   ierr = PetscFree3(i2,j2,perm2);CHKERRQ(ierr);
1653 
1654   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
1655   PetscInt Annz = Ai[m];
1656   PetscInt Bnnz = Bi[m];
1657   if (Annz < Annz1 + Annz2) {
1658     PetscInt *Aj_new;
1659     ierr = PetscMalloc1(Annz,&Aj_new);CHKERRQ(ierr);
1660     ierr = PetscArraycpy(Aj_new,Aj,Annz);CHKERRQ(ierr);
1661     ierr = PetscFree(Aj);CHKERRQ(ierr);
1662     Aj   = Aj_new;
1663   }
1664 
1665   if (Bnnz < Bnnz1 + Bnnz2) {
1666     PetscInt *Bj_new;
1667     ierr = PetscMalloc1(Bnnz,&Bj_new);CHKERRQ(ierr);
1668     ierr = PetscArraycpy(Bj_new,Bj,Bnnz);CHKERRQ(ierr);
1669     ierr = PetscFree(Bj);CHKERRQ(ierr);
1670     Bj   = Bj_new;
1671   }
1672 
1673   /* --------------------------------------------------------------------------------*/
1674   /* Create a MPIAIJKOKKOS newmat with CSRs of A and B, then replace mat with newmat */
1675   /* --------------------------------------------------------------------------------*/
1676   Mat           newmat;
1677   PetscScalar   *Aa,*Ba;
1678   Mat_MPIAIJ    *mpiaij;
1679   Mat_SeqAIJ    *a,*b;
1680 
1681   ierr   = PetscMalloc1(Annz,&Aa);CHKERRQ(ierr);
1682   ierr   = PetscMalloc1(Bnnz,&Ba);CHKERRQ(ierr);
1683   /* make Aj[] local, i.e, based off the start column of the diagonal portion */
1684   if (cstart) {for (k=0; k<Annz; k++) Aj[k] -= cstart;}
1685   ierr   = MatCreateMPIAIJWithSplitArrays(comm,m,n,M,N,Ai,Aj,Aa,Bi,Bj,Ba,&newmat);CHKERRQ(ierr);
1686   mpiaij = (Mat_MPIAIJ*)newmat->data;
1687   a      = (Mat_SeqAIJ*)mpiaij->A->data;
1688   b      = (Mat_SeqAIJ*)mpiaij->B->data;
1689   a->singlemalloc = b->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa,Bi,Bj,Ba */
1690   a->free_a       = b->free_a       = PETSC_TRUE;
1691   a->free_ij      = b->free_ij      = PETSC_TRUE;
1692   ierr   = MatConvert(newmat,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1693   ierr   = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1694   ierr   = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1695   mpiaij = (Mat_MPIAIJ*)mat->data;
1696   mpiaij->spptr = new Mat_MPIAIJKokkos(n1,sf2,nroots,nleaves,Annz1,Annz2,Bnnz1,Bnnz2,
1697                                        Aimap1_h,Aimap2_h,Bimap1_h,Bimap2_h,
1698                                        Ajmap1_h,Ajmap2_h,Bjmap1_h,Bjmap2_h,
1699                                        Aperm1_h,Aperm2_h,Bperm1_h,Bperm2_h,Cperm1_h);
1700   ierr = PetscSFDestroy(&sf2);CHKERRQ(ierr); /* ctor of Mat_MPIAIJKokkos already took a reference of sf3 */
1701   PetscFunctionReturn(0);
1702 }
1703 
1704 static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode)
1705 {
1706   PetscErrorCode                 ierr;
1707   Mat_MPIAIJ                     *mpiaij = (Mat_MPIAIJ*)mat->data;
1708   Mat_MPIAIJKokkos               *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr);
1709   Mat                            A = mpiaij->A,B = mpiaij->B;
1710   PetscInt                       Annz1 = mpikok->Annz1,Annz2 = mpikok->Annz2,Bnnz1 = mpikok->Bnnz1,Bnnz2 = mpikok->Bnnz2;
1711   MatScalarKokkosView            Aa,Ba;
1712   ConstMatScalarKokkosView       v1;
1713   MatScalarKokkosView&           vsend = mpikok->sendbuf_d;
1714   const MatScalarKokkosView&     v2 = mpikok->recvbuf_d;
1715   const MatRowMapKokkosView&     Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap1 = mpikok->Aimap1_d,Aimap2 = mpikok->Aimap2_d;
1716   const MatRowMapKokkosView&     Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap1 = mpikok->Bimap1_d,Bimap2 = mpikok->Bimap2_d;
1717   const MatRowMapKokkosView&     Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d;
1718   const MatRowMapKokkosView&     Cperm1 = mpikok->Cperm1_d;
1719   PetscMemType                   memtype;
1720 
1721   PetscFunctionBegin;
1722   if (!v) { /* NULL v means an all zero array */
1723     ierr = MatZeroEntries(mat);CHKERRQ(ierr);
1724     PetscFunctionReturn(0);
1725   }
1726 
1727   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1728   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */
1729     v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,mpikok->coo_n));
1730   } else {
1731     v1 = ConstMatScalarKokkosView(v,mpikok->coo_n); /* Directly use v[]'s memory */
1732   }
1733 
1734   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1735   ierr = MatSeqAIJGetKokkosView(B,&Ba);CHKERRQ(ierr);
1736   if (imode == INSERT_VALUES) {
1737     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1738     Kokkos::deep_copy(Ba,0.0);
1739   }
1740 
1741   /* Pack entries to be sent to remote */
1742   Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscInt i) {vsend(i) = v1(Cperm1(i));});
1743 
1744   /* Send remote entries to their owner and overlap the communication with local computation */
1745   ierr = PetscSFReduceWithMemTypeBegin(mpikok->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE);CHKERRQ(ierr);
1746   /* Add local entries to A and B */
1747   Kokkos::parallel_for(Annz1,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Ajmap1(i); k<Ajmap1(i+1); k++) Aa(Aimap1(i)) += v1(Aperm1(k));});
1748   Kokkos::parallel_for(Bnnz1,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Bjmap1(i); k<Bjmap1(i+1); k++) Ba(Bimap1(i)) += v1(Bperm1(k));});
1749   ierr = PetscSFReduceEnd(mpikok->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE);CHKERRQ(ierr);
1750 
1751   /* Add received remote entries to A and B */
1752   Kokkos::parallel_for(Annz2,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));});
1753   Kokkos::parallel_for(Bnnz2,KOKKOS_LAMBDA(const PetscInt i) {for (PetscInt k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));});
1754 
1755   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1756   ierr = MatSeqAIJRestoreKokkosView(B,&Ba);CHKERRQ(ierr);
1757 
1758   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1759   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1760   PetscFunctionReturn(0);
1761 }
1762 
1763 PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1764 {
1765   PetscErrorCode     ierr;
1766   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
1767 
1768   PetscFunctionBegin;
1769   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
1770   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
1771   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",   NULL);CHKERRQ(ierr);
1772   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",          NULL);CHKERRQ(ierr);
1773   delete (Mat_MPIAIJKokkos*)mpiaij->spptr;
1774   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
1775   PetscFunctionReturn(0);
1776 }
1777 
1778 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1779 {
1780   PetscErrorCode     ierr;
1781   Mat                B;
1782   Mat_MPIAIJ         *a;
1783 
1784   PetscFunctionBegin;
1785   if (reuse == MAT_INITIAL_MATRIX) {
1786     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
1787   } else if (reuse == MAT_REUSE_MATRIX) {
1788     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
1789   }
1790   B = *newmat;
1791 
1792   B->boundtocpu = PETSC_FALSE;
1793   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1794   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
1795   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1796 
1797   a = static_cast<Mat_MPIAIJ*>(A->data);
1798   if (a->A) {ierr = MatSetType(a->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1799   if (a->B) {ierr = MatSetType(a->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);}
1800   if (a->lvec) {ierr = VecSetType(a->lvec,VECSEQKOKKOS);CHKERRQ(ierr);}
1801 
1802   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1803   B->ops->mult                  = MatMult_MPIAIJKokkos;
1804   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1805   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1806   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1807   B->ops->destroy               = MatDestroy_MPIAIJKokkos;
1808 
1809   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1810   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);CHKERRQ(ierr);
1811   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C",   MatSetPreallocationCOO_MPIAIJKokkos);CHKERRQ(ierr);
1812   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C",          MatSetValuesCOO_MPIAIJKokkos);CHKERRQ(ierr);
1813   PetscFunctionReturn(0);
1814 }
1815 
1816 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1817 {
1818   PetscErrorCode ierr;
1819 
1820   PetscFunctionBegin;
1821   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1822   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
1823   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1824   PetscFunctionReturn(0);
1825 }
1826 
1827 /*@C
1828    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1829    (the default parallel PETSc format).  This matrix will ultimately pushed down
1830    to Kokkos for calculations. For good matrix
1831    assembly performance the user should preallocate the matrix storage by setting
1832    the parameter nz (or the array nnz).  By setting these parameters accurately,
1833    performance during matrix assembly can be increased by more than a factor of 50.
1834 
1835    Collective
1836 
1837    Input Parameters:
1838 +  comm - MPI communicator, set to PETSC_COMM_SELF
1839 .  m - number of rows
1840 .  n - number of columns
1841 .  nz - number of nonzeros per row (same for all rows)
1842 -  nnz - array containing the number of nonzeros in the various rows
1843          (possibly different for each row) or NULL
1844 
1845    Output Parameter:
1846 .  A - the matrix
1847 
1848    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1849    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1850    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1851 
1852    Notes:
1853    If nnz is given then nz is ignored
1854 
1855    The AIJ format (also called the Yale sparse matrix format or
1856    compressed row storage), is fully compatible with standard Fortran 77
1857    storage.  That is, the stored row and column indices can begin at
1858    either one (as in Fortran) or zero.  See the users' manual for details.
1859 
1860    Specify the preallocated storage with either nz or nnz (not both).
1861    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1862    allocation.  For large problems you MUST preallocate memory or you
1863    will get TERRIBLE performance, see the users' manual chapter on matrices.
1864 
1865    By default, this format uses inodes (identical nodes) when possible, to
1866    improve numerical efficiency of matrix-vector products and solves. We
1867    search for consecutive rows with the same nonzero structure, thereby
1868    reusing matrix information to achieve increased efficiency.
1869 
1870    Level: intermediate
1871 
1872 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
1873 @*/
1874 PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
1875 {
1876   PetscErrorCode ierr;
1877   PetscMPIInt    size;
1878 
1879   PetscFunctionBegin;
1880   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1881   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
1882   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1883   if (size > 1) {
1884     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1885     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1886   } else {
1887     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1888     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
1889   }
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
1894 PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1895 {
1896   PetscMPIInt                size,rank;
1897   MPI_Comm                   comm;
1898   PetscErrorCode             ierr;
1899   PetscSplitCSRDataStructure d_mat=NULL;
1900 
1901   PetscFunctionBegin;
1902   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1903   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1904   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
1905   if (size == 1) {
1906     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
1907     ierr   = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); /* Since we are going to modify matrix values on device */
1908   } else {
1909     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
1910     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
1911     ierr   = MatSeqAIJKokkosModifyDevice(aij->A);CHKERRQ(ierr);
1912     ierr   = MatSeqAIJKokkosModifyDevice(aij->B);CHKERRQ(ierr);
1913   }
1914   // act like MatSetValues because not called on host
1915   if (A->assembled) {
1916     if (A->was_assembled) {
1917       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
1918     }
1919     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
1920   } else {
1921     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled);CHKERRQ(ierr);
1922     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
1923   }
1924   if (!d_mat) {
1925     struct _n_SplitCSRMat h_mat; /* host container */
1926     Mat_SeqAIJKokkos      *aijkokA;
1927     Mat_SeqAIJ            *jaca;
1928     PetscInt              n = A->rmap->n, nnz;
1929     Mat                   Amat;
1930     PetscInt              *colmap;
1931 
1932     /* create and copy h_mat */
1933     h_mat.M = A->cmap->N; // use for debug build
1934     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
1935     if (size == 1) {
1936       Amat = A;
1937       jaca = (Mat_SeqAIJ*)A->data;
1938       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
1939       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
1940       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1941       h_mat.offdiag.a = NULL;
1942       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1943       aijkokA->i_uncompressed_d = NULL;
1944       aijkokA->colmap_d = NULL;
1945     } else {
1946       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
1947       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
1948       PetscInt         ii;
1949       Mat_SeqAIJKokkos *aijkokB;
1950 
1951       Amat = aij->A;
1952       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
1953       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
1954       aijkokA->i_uncompressed_d = NULL;
1955       aijkokA->colmap_d = NULL;
1956       jaca = (Mat_SeqAIJ*)aij->A->data;
1957       if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
1958       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
1959       aij->donotstash = PETSC_TRUE;
1960       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1961       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1962       ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr);
1963       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
1964       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
1965       // allocate B copy data
1966       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
1967       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
1968       nnz = jacb->i[n];
1969       if (jacb->compressedrow.use) {
1970         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
1971         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
1972         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
1973         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
1974       } else {
1975          h_mat.offdiag.i = aijkokB->i_device_data();
1976       }
1977       h_mat.offdiag.j = aijkokB->j_device_data();
1978       h_mat.offdiag.a = aijkokB->a_device_data();
1979       {
1980         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
1981         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
1982         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
1983         h_mat.colmap = aijkokB->colmap_d->data();
1984         ierr = PetscFree(colmap);CHKERRQ(ierr);
1985       }
1986       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1987       h_mat.offdiag.n = n;
1988     }
1989     // allocate A copy data
1990     nnz = jaca->i[n];
1991     h_mat.diag.n = n;
1992     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1993     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
1994     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
1995     else {
1996       h_mat.diag.i = aijkokA->i_device_data();
1997     }
1998     h_mat.diag.j = aijkokA->j_device_data();
1999     h_mat.diag.a = aijkokA->a_device_data();
2000     // copy pointers and metdata to device
2001     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
2002     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
2003     ierr = PetscInfo2(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
2004   }
2005   *B = d_mat; // return it, set it in Mat, and set it up
2006   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
2007   PetscFunctionReturn(0);
2008 }
2009 
2010 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
2011 {
2012   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2013 
2014   PetscFunctionBegin;
2015   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
2016   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
2017   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
2018   else *mask = "PETSC_OFFLOAD_BOTH";
2019   PetscFunctionReturn(0);
2020 }
2021 
2022 PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
2023 {
2024   PetscErrorCode    ierr;
2025   PetscMPIInt       size;
2026   Mat               Ad,Ao;
2027   const char        *amask,*bmask;
2028 
2029   PetscFunctionBegin;
2030   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
2031 
2032   if (size == 1) {
2033     ierr = MatSeqAIJKokkosGetOffloadMask(A,&amask);CHKERRQ(ierr);
2034     ierr = PetscPrintf(PETSC_COMM_SELF,"%s\n",amask);CHKERRQ(ierr);
2035   } else {
2036     Ad  = ((Mat_MPIAIJ*)A->data)->A;
2037     Ao  = ((Mat_MPIAIJ*)A->data)->B;
2038     ierr = MatSeqAIJKokkosGetOffloadMask(Ad,&amask);CHKERRQ(ierr);
2039     ierr = MatSeqAIJKokkosGetOffloadMask(Ao,&bmask);CHKERRQ(ierr);
2040     ierr = PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask);CHKERRQ(ierr);
2041   }
2042   PetscFunctionReturn(0);
2043 }
2044