xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7b8d4ba6a08626823f4aa3c70fa9bb901abea174)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
188c3ff71bSJunchao Zhang 
190e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
23f98996d3SJunchao Zhang #else
24f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
27f98996d3SJunchao Zhang #endif
28f98996d3SJunchao Zhang 
298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
308c3ff71bSJunchao Zhang 
31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
34076ba34aSJunchao Zhang  */
35d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36d71ae5a4SJacob Faibussowitsch {
37076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
38076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
398c3ff71bSJunchao Zhang 
408c3ff71bSJunchao Zhang   PetscFunctionBegin;
413ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
429566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43076ba34aSJunchao Zhang 
44076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46076ba34aSJunchao Zhang 
47076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
48076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
50076ba34aSJunchao Zhang   */
51076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52076ba34aSJunchao Zhang     delete aijkok;
53076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54076ba34aSJunchao Zhang     A->spptr = aijkok;
55076ba34aSJunchao Zhang   }
56076ba34aSJunchao Zhang 
573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
588c3ff71bSJunchao Zhang }
598c3ff71bSJunchao Zhang 
6086a27549SJunchao Zhang /* Sync CSR data to device if not yet */
61d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
62d71ae5a4SJacob Faibussowitsch {
638c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
648c3ff71bSJunchao Zhang 
658c3ff71bSJunchao Zhang   PetscFunctionBegin;
66aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
675f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
68076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
69076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
70580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
7186a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
728c3ff71bSJunchao Zhang   }
733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
748c3ff71bSJunchao Zhang }
758c3ff71bSJunchao Zhang 
76076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
77d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
78d71ae5a4SJacob Faibussowitsch {
7986a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
8086a27549SJunchao Zhang 
8186a27549SJunchao Zhang   PetscFunctionBegin;
825f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
8386a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8486a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8586a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8686a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
889566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9086a27549SJunchao Zhang }
9186a27549SJunchao Zhang 
92d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
93d71ae5a4SJacob Faibussowitsch {
94f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
95f0cf5187SStefano Zampini 
96f0cf5187SStefano Zampini   PetscFunctionBegin;
97f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9886a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
99aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1005f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
101076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
1023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
103f0cf5187SStefano Zampini }
104f0cf5187SStefano Zampini 
105d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
106d71ae5a4SJacob Faibussowitsch {
107076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
108f0cf5187SStefano Zampini 
109f0cf5187SStefano Zampini   PetscFunctionBegin;
1105519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1115519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1125519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1135519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1145519a089SJose E. Roman   */
1155519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
116076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
117076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
118076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
119076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
120076ba34aSJunchao Zhang   }
1213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
122076ba34aSJunchao Zhang }
123076ba34aSJunchao Zhang 
124d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
125d71ae5a4SJacob Faibussowitsch {
126076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127076ba34aSJunchao Zhang 
128076ba34aSJunchao Zhang   PetscFunctionBegin;
1295519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
131076ba34aSJunchao Zhang }
132076ba34aSJunchao Zhang 
133d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
134d71ae5a4SJacob Faibussowitsch {
135076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
136076ba34aSJunchao Zhang 
137076ba34aSJunchao Zhang   PetscFunctionBegin;
1385519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
139076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
140076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1412328674fSJunchao Zhang   } else {
1422328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1432328674fSJunchao Zhang   }
1443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
145076ba34aSJunchao Zhang }
146076ba34aSJunchao Zhang 
147d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
148d71ae5a4SJacob Faibussowitsch {
149076ba34aSJunchao Zhang   PetscFunctionBegin;
150076ba34aSJunchao Zhang   *array = NULL;
1513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
152076ba34aSJunchao Zhang }
153076ba34aSJunchao Zhang 
154d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
155d71ae5a4SJacob Faibussowitsch {
156076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157076ba34aSJunchao Zhang 
158076ba34aSJunchao Zhang   PetscFunctionBegin;
1595519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
160076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1612328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1622328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1632328674fSJunchao Zhang   }
1643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
165076ba34aSJunchao Zhang }
166076ba34aSJunchao Zhang 
167d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
168d71ae5a4SJacob Faibussowitsch {
169076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
170076ba34aSJunchao Zhang 
171076ba34aSJunchao Zhang   PetscFunctionBegin;
1725519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
173076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
174076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1752328674fSJunchao Zhang   }
1763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
177f0cf5187SStefano Zampini }
178f0cf5187SStefano Zampini 
179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
180d71ae5a4SJacob Faibussowitsch {
1817ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1827ee59b9bSJunchao Zhang 
1837ee59b9bSJunchao Zhang   PetscFunctionBegin;
1847ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
1857ee59b9bSJunchao Zhang 
1867ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1877ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
1887ee59b9bSJunchao Zhang   if (a) {
1897ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
1907ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
1917ee59b9bSJunchao Zhang   }
1927ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
1933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1947ee59b9bSJunchao Zhang }
1957ee59b9bSJunchao Zhang 
1960e3ece09SJunchao Zhang /*
1970e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
1980e3ece09SJunchao Zhang 
1990e3ece09SJunchao Zhang   Input Parameter:
2000e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2010e3ece09SJunchao Zhang 
2020e3ece09SJunchao Zhang   Output Parameters:
2030e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
204aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2050e3ece09SJunchao Zhang */
2060e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
207d71ae5a4SJacob Faibussowitsch {
2080e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2090e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2100e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
211*7b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2120e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
213*7b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
214*7b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2150e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2160e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2170e3ece09SJunchao Zhang   PetscInt               *offset;
218152b3e56SJunchao Zhang 
219152b3e56SJunchao Zhang   PetscFunctionBegin;
2200e3ece09SJunchao Zhang   // Populate Ti
2210e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2220e3ece09SJunchao Zhang   Ti++;
2230e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2240e3ece09SJunchao Zhang   Ti--;
2250e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2260e3ece09SJunchao Zhang 
2270e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2280e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2290e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2300e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2310e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2320e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2330e3ece09SJunchao Zhang 
2340e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2350e3ece09SJunchao Zhang       perm[disp] = j;
2360e3ece09SJunchao Zhang       offset[r]++;
237076ba34aSJunchao Zhang     }
2380e3ece09SJunchao Zhang   }
2390e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2400e3ece09SJunchao Zhang 
2410e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2420e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2430e3ece09SJunchao Zhang 
2440e3ece09SJunchao Zhang   // Output perm and T on device
2450e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2460e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2470e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2480e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
250152b3e56SJunchao Zhang }
251152b3e56SJunchao Zhang 
2520e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
2530e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
2540e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
255d71ae5a4SJacob Faibussowitsch {
2560e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2570e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2580e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2590e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
260152b3e56SJunchao Zhang 
261152b3e56SJunchao Zhang   PetscFunctionBegin;
2620e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
2630e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
2640e3ece09SJunchao Zhang 
2650e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
2660e3ece09SJunchao Zhang 
2670e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
2680e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
2690e3ece09SJunchao Zhang   } else {
2700e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
2710e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
2720e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
2730e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
2740e3ece09SJunchao Zhang         auto       &Ta   = T.values;
2750e3ece09SJunchao Zhang 
2760e3ece09SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(
2770e3ece09SJunchao Zhang           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
278076ba34aSJunchao Zhang       }
2790e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
2800e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
2810e3ece09SJunchao Zhang 
2820e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
2830e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
2840e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(
2850e3ece09SJunchao Zhang         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
2860e3ece09SJunchao Zhang     }
2870e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
2880e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
2890e3ece09SJunchao Zhang   }
2900e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2910e3ece09SJunchao Zhang }
2920e3ece09SJunchao Zhang 
2930e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
2940e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
2950e3ece09SJunchao Zhang {
2960e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2970e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2980e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2990e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3000e3ece09SJunchao Zhang 
3010e3ece09SJunchao Zhang   PetscFunctionBegin;
3020e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
3030e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
3040e3ece09SJunchao Zhang 
3050e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3060e3ece09SJunchao Zhang 
3070e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3080e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3090e3ece09SJunchao Zhang   } else {
3100e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3110e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3120e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3130e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3140e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3150e3ece09SJunchao Zhang 
3160e3ece09SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(
3170e3ece09SJunchao Zhang           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3180e3ece09SJunchao Zhang       }
3190e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3200e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3210e3ece09SJunchao Zhang 
3220e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3230e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
3240e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(
3250e3ece09SJunchao Zhang         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3260e3ece09SJunchao Zhang     }
3270e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3280e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3290e3ece09SJunchao Zhang   }
3303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
331152b3e56SJunchao Zhang }
332a587d139SMark 
3338c3ff71bSJunchao Zhang /* y = A x */
334d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
335d71ae5a4SJacob Faibussowitsch {
3368c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
337152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
338152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3398c3ff71bSJunchao Zhang 
3408c3ff71bSJunchao Zhang   PetscFunctionBegin;
3419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3439566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3449566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3458c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3469d52486cSJunchao Zhang   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3479566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3489566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
349076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3538c3ff71bSJunchao Zhang }
3548c3ff71bSJunchao Zhang 
3558c3ff71bSJunchao Zhang /* y = A^T x */
356d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
357d71ae5a4SJacob Faibussowitsch {
3588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
359152b3e56SJunchao Zhang   const char                *mode;
360152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
361152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3620e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3638c3ff71bSJunchao Zhang 
3648c3ff71bSJunchao Zhang   PetscFunctionBegin;
3659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3679566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3689566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
369152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
371152b3e56SJunchao Zhang     mode = "N";
372152b3e56SJunchao Zhang   } else {
373076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3740e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
375152b3e56SJunchao Zhang     mode   = "T";
376152b3e56SJunchao Zhang   }
3770e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
3789566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3799566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3800e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
3819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3838c3ff71bSJunchao Zhang }
3848c3ff71bSJunchao Zhang 
3858c3ff71bSJunchao Zhang /* y = A^H x */
386d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387d71ae5a4SJacob Faibussowitsch {
3888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
389152b3e56SJunchao Zhang   const char                *mode;
390152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
391152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3920e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3938c3ff71bSJunchao Zhang 
3948c3ff71bSJunchao Zhang   PetscFunctionBegin;
3959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3979566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3989566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
399152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
401152b3e56SJunchao Zhang     mode = "N";
402152b3e56SJunchao Zhang   } else {
403076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4040e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
405152b3e56SJunchao Zhang     mode   = "C";
406152b3e56SJunchao Zhang   }
4070e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4089566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4099566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4100e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4119566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4138c3ff71bSJunchao Zhang }
4148c3ff71bSJunchao Zhang 
4158c3ff71bSJunchao Zhang /* z = A x + y */
416d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
417d71ae5a4SJacob Faibussowitsch {
4188c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
419152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
420152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4218c3ff71bSJunchao Zhang 
4228c3ff71bSJunchao Zhang   PetscFunctionBegin;
4239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4249566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4259566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4269566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4279566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4288c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
4298c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4309d52486cSJunchao Zhang   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4319566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4329566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4339566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
4349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4378c3ff71bSJunchao Zhang }
4388c3ff71bSJunchao Zhang 
4398c3ff71bSJunchao Zhang /* z = A^T x + y */
440d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
441d71ae5a4SJacob Faibussowitsch {
4428c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
443152b3e56SJunchao Zhang   const char                *mode;
444152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
445152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4460e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4478c3ff71bSJunchao Zhang 
4488c3ff71bSJunchao Zhang   PetscFunctionBegin;
4499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4519566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4529566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4539566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4548c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
455152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4569566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
457152b3e56SJunchao Zhang     mode = "N";
458152b3e56SJunchao Zhang   } else {
459076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4600e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
461152b3e56SJunchao Zhang     mode   = "T";
462152b3e56SJunchao Zhang   }
4630e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
4649566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4659566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4669566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
4670e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4708c3ff71bSJunchao Zhang }
4718c3ff71bSJunchao Zhang 
4728c3ff71bSJunchao Zhang /* z = A^H x + y */
473d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
474d71ae5a4SJacob Faibussowitsch {
4758c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
476152b3e56SJunchao Zhang   const char                *mode;
477152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
478152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4790e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4808c3ff71bSJunchao Zhang 
4818c3ff71bSJunchao Zhang   PetscFunctionBegin;
4829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4849566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4859566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4869566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4878c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
488152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4899566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
490152b3e56SJunchao Zhang     mode = "N";
491152b3e56SJunchao Zhang   } else {
492076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4930e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
494152b3e56SJunchao Zhang     mode   = "C";
495152b3e56SJunchao Zhang   }
4960e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
4979566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4989566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4999566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
5000e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
503152b3e56SJunchao Zhang }
504152b3e56SJunchao Zhang 
505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
506d71ae5a4SJacob Faibussowitsch {
507152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508152b3e56SJunchao Zhang 
509152b3e56SJunchao Zhang   PetscFunctionBegin;
510152b3e56SJunchao Zhang   switch (op) {
511152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
512152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5139566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
514152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
515152b3e56SJunchao Zhang     break;
516d71ae5a4SJacob Faibussowitsch   default:
517d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
518d71ae5a4SJacob Faibussowitsch     break;
519152b3e56SJunchao Zhang   }
5203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5218c3ff71bSJunchao Zhang }
5228c3ff71bSJunchao Zhang 
523076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
524d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
525d71ae5a4SJacob Faibussowitsch {
526076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5278c3ff71bSJunchao Zhang 
5288c3ff71bSJunchao Zhang   PetscFunctionBegin;
5299566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
530076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
5319566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
5328c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5339566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
534076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5355f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5369566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5379566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5389566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5399566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
540076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
541394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5425f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
543076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
5448c3ff71bSJunchao Zhang     }
545076ba34aSJunchao Zhang   }
5463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5478c3ff71bSJunchao Zhang }
5488c3ff71bSJunchao Zhang 
549076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
550076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
551076ba34aSJunchao Zhang  */
552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
553d71ae5a4SJacob Faibussowitsch {
554076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
555076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
556076ba34aSJunchao Zhang   Mat               mat;
5578c3ff71bSJunchao Zhang 
5588c3ff71bSJunchao Zhang   PetscFunctionBegin;
559076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5609566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
561076ba34aSJunchao Zhang   mat = *B;
562076ba34aSJunchao Zhang   if (A->assembled) {
563076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
564076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
565076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
566076ba34aSJunchao Zhang     /* Now copy values to B if needed */
567076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
568076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
569076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
570076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
571076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
572076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
573076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
574076ba34aSJunchao Zhang       }
575076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
576076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
577076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
578076ba34aSJunchao Zhang     }
579076ba34aSJunchao Zhang     mat->spptr = bkok;
580076ba34aSJunchao Zhang   }
581076ba34aSJunchao Zhang 
5829566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5839566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
5849566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
5859566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5878c3ff71bSJunchao Zhang }
5888c3ff71bSJunchao Zhang 
589d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
590d71ae5a4SJacob Faibussowitsch {
5910ecb592aSJunchao Zhang   Mat               At;
5920e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
5930ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
5940ecb592aSJunchao Zhang 
5950ecb592aSJunchao Zhang   PetscFunctionBegin;
5967fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
5979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
5980ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
599ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6000e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6019566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6020ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6039566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6040ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6050ecb592aSJunchao Zhang     if ((*B)->assembled) {
6060ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6070e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6089566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6090ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6100ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6110e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6120e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6130e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6140e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6150ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6160ecb592aSJunchao Zhang   }
6173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6180ecb592aSJunchao Zhang }
6190ecb592aSJunchao Zhang 
620d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
621d71ae5a4SJacob Faibussowitsch {
62286a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6238c3ff71bSJunchao Zhang 
6248c3ff71bSJunchao Zhang   PetscFunctionBegin;
62586a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
62686a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6278c3ff71bSJunchao Zhang     delete aijkok;
62886a27549SJunchao Zhang   } else {
62986a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
63086a27549SJunchao Zhang   }
631cbc6b225SStefano Zampini   A->spptr = NULL;
6329566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6339566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6349566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
6359566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6378c3ff71bSJunchao Zhang }
6388c3ff71bSJunchao Zhang 
6393f3ba80aSJunchao Zhang /*MC
6403f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6413f3ba80aSJunchao Zhang 
6423f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
6433f3ba80aSJunchao Zhang 
6442ef1f0ffSBarry Smith    Options Database Key:
64511a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6463f3ba80aSJunchao Zhang 
6473f3ba80aSJunchao Zhang   Level: beginner
6483f3ba80aSJunchao Zhang 
6492ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6503f3ba80aSJunchao Zhang M*/
651d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
652d71ae5a4SJacob Faibussowitsch {
65386a27549SJunchao Zhang   PetscFunctionBegin;
6549566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6559566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6569566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
65886a27549SJunchao Zhang }
65986a27549SJunchao Zhang 
660076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
661d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
662d71ae5a4SJacob Faibussowitsch {
663076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
664076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
665076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
666076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
667076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
668076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
669a3f881fbSStefano Zampini 
670a3f881fbSStefano Zampini   PetscFunctionBegin;
671076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
672076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
673076ba34aSJunchao Zhang   PetscValidPointer(C, 4);
674076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
675076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
6765f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
6775f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
678076ba34aSJunchao Zhang 
6799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6809566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
681076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
682076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
683076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
684076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
685076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
686076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
687076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
688076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
689076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
690076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
691076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
692076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
693076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
694076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
695076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
696076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
697076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
698076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
699076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
700076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
701076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
702076ba34aSJunchao Zhang 
703076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7049371c9d4SSatish Balay     Kokkos::parallel_for(
7059371c9d4SSatish Balay       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
706076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
707076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
708076ba34aSJunchao Zhang 
709076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
710076ba34aSJunchao Zhang                                                    ci(i) = coffset;
711076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
712076ba34aSJunchao Zhang         });
713076ba34aSJunchao Zhang 
714076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
715076ba34aSJunchao Zhang           if (k < alen) {
716076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
717076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
718076ba34aSJunchao Zhang           } else {
719076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
720076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
721076ba34aSJunchao Zhang           }
722076ba34aSJunchao Zhang         });
723076ba34aSJunchao Zhang       });
724076ba34aSJunchao Zhang     ca_dual.modify_device();
725076ba34aSJunchao Zhang     ci_dual.modify_device();
726076ba34aSJunchao Zhang     cj_dual.modify_device();
7279566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
7289566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
729076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
730076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
731076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
732076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
733076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
734076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
735076ba34aSJunchao Zhang 
7369371c9d4SSatish Balay     Kokkos::parallel_for(
7379371c9d4SSatish Balay       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
738076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
739076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
740076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
741076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
742076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
743076ba34aSJunchao Zhang         });
744076ba34aSJunchao Zhang       });
7459566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
746076ba34aSJunchao Zhang   }
7473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
748076ba34aSJunchao Zhang }
749076ba34aSJunchao Zhang 
750d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
751d71ae5a4SJacob Faibussowitsch {
752076ba34aSJunchao Zhang   PetscFunctionBegin;
753076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
755a3f881fbSStefano Zampini }
756a3f881fbSStefano Zampini 
757d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
758d71ae5a4SJacob Faibussowitsch {
759a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
760a3f881fbSStefano Zampini   Mat                          A, B;
761076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
762a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
763a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
764076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
7650e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
766a3f881fbSStefano Zampini 
767a3f881fbSStefano Zampini   PetscFunctionBegin;
768a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7695f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
770076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
771076ba34aSJunchao Zhang 
7720e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
7730e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
7740e3ece09SJunchao Zhang   // we still do numeric.
7750e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
7760e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
7773ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
778076ba34aSJunchao Zhang   }
779076ba34aSJunchao Zhang 
780076ba34aSJunchao Zhang   switch (product->type) {
7819371c9d4SSatish Balay   case MATPRODUCT_AB:
7829371c9d4SSatish Balay     transA = false;
7839371c9d4SSatish Balay     transB = false;
7849371c9d4SSatish Balay     break;
7859371c9d4SSatish Balay   case MATPRODUCT_AtB:
7869371c9d4SSatish Balay     transA = true;
7879371c9d4SSatish Balay     transB = false;
7889371c9d4SSatish Balay     break;
7899371c9d4SSatish Balay   case MATPRODUCT_ABt:
7909371c9d4SSatish Balay     transA = false;
7919371c9d4SSatish Balay     transB = true;
7929371c9d4SSatish Balay     break;
793d71ae5a4SJacob Faibussowitsch   default:
794d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
795076ba34aSJunchao Zhang   }
796076ba34aSJunchao Zhang 
797a3f881fbSStefano Zampini   A = product->A;
798a3f881fbSStefano Zampini   B = product->B;
7999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
801a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
802a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
803a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
804076ba34aSJunchao Zhang 
8055f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
806076ba34aSJunchao Zhang 
8070e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8080e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
809076ba34aSJunchao Zhang 
810076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
811076ba34aSJunchao Zhang   if (transA) {
8129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
813076ba34aSJunchao Zhang     transA = false;
814a3f881fbSStefano Zampini   }
815a3f881fbSStefano Zampini 
816076ba34aSJunchao Zhang   if (transB) {
8179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
818076ba34aSJunchao Zhang     transB = false;
819076ba34aSJunchao Zhang   }
8209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8210e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
823866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
824866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
825e944a159SJunchao Zhang #endif
826866eb059SJunchao Zhang 
8279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
829a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
830a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8319566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8329566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8339566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
834a3f881fbSStefano Zampini   c->reallocs         = 0;
835076ba34aSJunchao Zhang   C->info.mallocs     = 0;
836a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
837a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
838a3f881fbSStefano Zampini   C->num_ass++;
8393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
840a3f881fbSStefano Zampini }
841a3f881fbSStefano Zampini 
842d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
843d71ae5a4SJacob Faibussowitsch {
844076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
845076ba34aSJunchao Zhang   MatProductType               ptype;
846076ba34aSJunchao Zhang   Mat                          A, B;
847076ba34aSJunchao Zhang   bool                         transA, transB;
848076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
849076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
850076ba34aSJunchao Zhang   MPI_Comm                     comm;
8510e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
852a3f881fbSStefano Zampini 
853a3f881fbSStefano Zampini   PetscFunctionBegin;
854a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8559566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8565f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
857a3f881fbSStefano Zampini   A = product->A;
858a3f881fbSStefano Zampini   B = product->B;
8599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8609566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
861a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
862a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8630e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8640e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
865076ba34aSJunchao Zhang 
866a3f881fbSStefano Zampini   ptype = product->type;
8670e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
8680e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
8690e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8700e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
8710e3ece09SJunchao Zhang   }
8720e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
8730e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8740e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
8750e3ece09SJunchao Zhang   }
8760e3ece09SJunchao Zhang 
877a3f881fbSStefano Zampini   switch (ptype) {
8789371c9d4SSatish Balay   case MATPRODUCT_AB:
8799371c9d4SSatish Balay     transA = false;
8809371c9d4SSatish Balay     transB = false;
8819371c9d4SSatish Balay     break;
8829371c9d4SSatish Balay   case MATPRODUCT_AtB:
8839371c9d4SSatish Balay     transA = true;
8849371c9d4SSatish Balay     transB = false;
8859371c9d4SSatish Balay     break;
8869371c9d4SSatish Balay   case MATPRODUCT_ABt:
8879371c9d4SSatish Balay     transA = false;
8889371c9d4SSatish Balay     transB = true;
8899371c9d4SSatish Balay     break;
890d71ae5a4SJacob Faibussowitsch   default:
891d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
892a3f881fbSStefano Zampini   }
8930e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
894076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
895a3f881fbSStefano Zampini 
896076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
897866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
898866eb059SJunchao Zhang 
899866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
900866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
901866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
902866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
903866eb059SJunchao Zhang   #endif
904866eb059SJunchao Zhang #endif
9050e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
906076ba34aSJunchao Zhang 
9079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
908076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
909076ba34aSJunchao Zhang   if (transA) {
9109566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
911076ba34aSJunchao Zhang     transA = false;
912076ba34aSJunchao Zhang   }
913076ba34aSJunchao Zhang 
914076ba34aSJunchao Zhang   if (transB) {
9159566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
916076ba34aSJunchao Zhang     transB = false;
917076ba34aSJunchao Zhang   }
918076ba34aSJunchao Zhang 
9190e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
920076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
921076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
922076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
923076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
924076ba34aSJunchao Zhang   */
9250e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9260e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
927866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
928866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
929866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
930e944a159SJunchao Zhang #endif
9319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
932076ba34aSJunchao Zhang 
9339566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9349566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
935076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
937a3f881fbSStefano Zampini }
938a3f881fbSStefano Zampini 
939a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
940d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
941d71ae5a4SJacob Faibussowitsch {
942076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
943a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
944a3f881fbSStefano Zampini 
945a3f881fbSStefano Zampini   PetscFunctionBegin;
946a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9479566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
94848a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
949a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
950a3f881fbSStefano Zampini     switch (product->type) {
951a3f881fbSStefano Zampini     case MATPRODUCT_AB:
952a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
953d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
954d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
955d71ae5a4SJacob Faibussowitsch       break;
956a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
957a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
958d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
959d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
960d71ae5a4SJacob Faibussowitsch       break;
961d71ae5a4SJacob Faibussowitsch     default:
962d71ae5a4SJacob Faibussowitsch       break;
963a3f881fbSStefano Zampini     }
964a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
9659566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
966a3f881fbSStefano Zampini   }
9673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
968a3f881fbSStefano Zampini }
969a587d139SMark 
970d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
971d71ae5a4SJacob Faibussowitsch {
972f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
973f0cf5187SStefano Zampini 
974f0cf5187SStefano Zampini   PetscFunctionBegin;
9759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
9769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
977f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
978076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
9799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
9809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
9819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
9823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
983f0cf5187SStefano Zampini }
984f0cf5187SStefano Zampini 
985d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
986d71ae5a4SJacob Faibussowitsch {
987076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
988a587d139SMark 
989a587d139SMark   PetscFunctionBegin;
990076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
9912328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
992076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
9939566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
9942328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
9959566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
9962328674fSJunchao Zhang   }
9973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
998a587d139SMark }
999a587d139SMark 
1000d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1001d71ae5a4SJacob Faibussowitsch {
1002f78ce678SMark Adams   Mat_SeqAIJ           *aijseq;
1003f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1004f78ce678SMark Adams   PetscInt              n;
1005f78ce678SMark Adams   PetscScalarKokkosView xv;
1006f78ce678SMark Adams 
1007f78ce678SMark Adams   PetscFunctionBegin;
1008f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1009f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1010f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1011f78ce678SMark Adams 
1012f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1013f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1014f78ce678SMark Adams 
1015f78ce678SMark Adams   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1016f78ce678SMark Adams     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1017f78ce678SMark Adams     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1018f78ce678SMark Adams     aijkok->SetDiagonal(aijseq->diag);
1019f78ce678SMark Adams   }
1020f78ce678SMark Adams 
1021f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1022f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1023f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1024f78ce678SMark Adams 
1025f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
10269371c9d4SSatish Balay   Kokkos::parallel_for(
10279371c9d4SSatish Balay     n, KOKKOS_LAMBDA(const PetscInt i) {
1028f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1029f78ce678SMark Adams       else xv(i) = 0;
1030f78ce678SMark Adams     });
1031f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
10323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1033f78ce678SMark Adams }
1034f78ce678SMark Adams 
1035db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1036d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1037d71ae5a4SJacob Faibussowitsch {
1038db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1039db78de30SJunchao Zhang 
1040db78de30SJunchao Zhang   PetscFunctionBegin;
1041db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1042db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1043db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1045db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
10473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1048db78de30SJunchao Zhang }
1049db78de30SJunchao Zhang 
1050d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1051d71ae5a4SJacob Faibussowitsch {
1052db78de30SJunchao Zhang   PetscFunctionBegin;
1053db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1054db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1055db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1057db78de30SJunchao Zhang }
1058db78de30SJunchao Zhang 
1059d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1060d71ae5a4SJacob Faibussowitsch {
1061db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1062db78de30SJunchao Zhang 
1063db78de30SJunchao Zhang   PetscFunctionBegin;
1064db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1065db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1066db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1068db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1069076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
10703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1071db78de30SJunchao Zhang }
1072db78de30SJunchao Zhang 
1073d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1074d71ae5a4SJacob Faibussowitsch {
1075db78de30SJunchao Zhang   PetscFunctionBegin;
1076db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1077db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1078db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
10803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1081db78de30SJunchao Zhang }
1082db78de30SJunchao Zhang 
1083d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1084d71ae5a4SJacob Faibussowitsch {
1085db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1086db78de30SJunchao Zhang 
1087db78de30SJunchao Zhang   PetscFunctionBegin;
1088db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1089db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1090db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1091db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
10933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1094db78de30SJunchao Zhang }
1095db78de30SJunchao Zhang 
1096d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1097d71ae5a4SJacob Faibussowitsch {
1098db78de30SJunchao Zhang   PetscFunctionBegin;
1099db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1100db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
1101db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1104db78de30SJunchao Zhang }
1105db78de30SJunchao Zhang 
1106c17cf699SJunchao Zhang /* Computes Y += alpha X */
1107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1108d71ae5a4SJacob Faibussowitsch {
1109a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1110c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1111c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1112c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
1113a587d139SMark 
1114a587d139SMark   PetscFunctionBegin;
1115c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1116c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
11179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
11189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
11199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1120db78de30SJunchao Zhang 
1121c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1122a587d139SMark     PetscBool e;
11239566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1124a587d139SMark     if (e) {
11259566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1126c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1127a587d139SMark     }
1128a587d139SMark   }
1129db78de30SJunchao Zhang 
1130c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1131c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1132c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1133c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1134c17cf699SJunchao Zhang   */
1135c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1136c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1137c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1138c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1139c17cf699SJunchao Zhang 
1140c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1141c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha, Xa, Ya);
11429566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1143c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1144c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1145c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1146c17cf699SJunchao Zhang 
11479371c9d4SSatish Balay     Kokkos::parallel_for(
11489371c9d4SSatish Balay       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
11490e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
11500e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
11510e3ece09SJunchao Zhang           // Only one thread works in a team
1152c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
11530e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
11540e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
11550e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1156c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1157c17cf699SJunchao Zhang               q++;
1158a587d139SMark             } else {
11590e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
11600e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
11610e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
11620e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
11638b8b16f9SJunchao Zhang #else
11640e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
11658b8b16f9SJunchao Zhang #endif
1166a587d139SMark             }
1167c17cf699SJunchao Zhang           }
1168c17cf699SJunchao Zhang         });
1169c17cf699SJunchao Zhang       });
11709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
11710e3ece09SJunchao Zhang   } else { // different nonzero patterns
1172c17cf699SJunchao Zhang     Mat             Z;
1173c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1174c17cf699SJunchao Zhang     KernelHandle    kh;
11750e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1176c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1177c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1178c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
11799566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
11809566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1181c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1182c17cf699SJunchao Zhang   }
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
11840e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
11853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1186a587d139SMark }
1187a587d139SMark 
1188d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1189d71ae5a4SJacob Faibussowitsch {
119042550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
119142550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
119242550becSJunchao Zhang 
119342550becSJunchao Zhang   PetscFunctionBegin;
11949566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1195394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
119642550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1197cbc6b225SStefano Zampini   delete akok;
1198cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
11999566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1200394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
12013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
120242550becSJunchao Zhang }
120342550becSJunchao Zhang 
1204d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1205d71ae5a4SJacob Faibussowitsch {
120642550becSJunchao Zhang   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
120742550becSJunchao Zhang   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1209394ed5ebSJunchao Zhang   const PetscCountKokkosView &jmap = akok->jmap_d;
1210394ed5ebSJunchao Zhang   const PetscCountKokkosView &perm = akok->perm_d;
121142550becSJunchao Zhang   MatScalarKokkosView         Aa;
121242550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
121342550becSJunchao Zhang   PetscMemType                memtype;
121442550becSJunchao Zhang 
121542550becSJunchao Zhang   PetscFunctionBegin;
12169566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
121742550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1218394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
121942550becSJunchao Zhang   } else {
1220394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
122142550becSJunchao Zhang   }
122242550becSJunchao Zhang 
1223c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1224c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
122542550becSJunchao Zhang 
12269371c9d4SSatish Balay   Kokkos::parallel_for(
12279371c9d4SSatish Balay     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1228c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1229c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1230c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1231c7b718f4SJunchao Zhang     });
1232394ed5ebSJunchao Zhang 
12339566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
12349566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
12353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
123642550becSJunchao Zhang }
123742550becSJunchao Zhang 
1238d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1239d71ae5a4SJacob Faibussowitsch {
12405fbaff96SJunchao Zhang   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
12415fbaff96SJunchao Zhang   MatScalarKokkosView        Aa;
12425fbaff96SJunchao Zhang   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
12435fbaff96SJunchao Zhang   PetscInt                   m  = A->rmap->n;
12445fbaff96SJunchao Zhang   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
12455fbaff96SJunchao Zhang 
12465fbaff96SJunchao Zhang   PetscFunctionBegin;
12475fbaff96SJunchao Zhang   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
12489371c9d4SSatish Balay   Kokkos::parallel_for(
12499371c9d4SSatish Balay     m, KOKKOS_LAMBDA(const PetscInt i) {
12505fbaff96SJunchao Zhang       PetscScalar tmp;
12515fbaff96SJunchao Zhang       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
12525fbaff96SJunchao Zhang         tmp          = Aa(Ai(i));
12535fbaff96SJunchao Zhang         Aa(Ai(i))    = Aa(Adiag(i));
12545fbaff96SJunchao Zhang         Aa(Adiag(i)) = tmp;
12555fbaff96SJunchao Zhang       }
12565fbaff96SJunchao Zhang     });
12575fbaff96SJunchao Zhang   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
12583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12595fbaff96SJunchao Zhang }
12605fbaff96SJunchao Zhang 
1261d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1262d71ae5a4SJacob Faibussowitsch {
12638f7e8f9dSMark Adams   PetscFunctionBegin;
12649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
12659566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
12668f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
12673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12688f7e8f9dSMark Adams }
12698f7e8f9dSMark Adams 
1270d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1271d71ae5a4SJacob Faibussowitsch {
1272076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1273076ba34aSJunchao Zhang 
12748c3ff71bSJunchao Zhang   PetscFunctionBegin;
1275076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
12766f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
12776f3d89d0SStefano Zampini 
12788c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
12798c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
12808c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1281a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1282f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1283a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1284076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
12858c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
12868c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
12878c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
12888c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
12898c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
12908c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1291076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
12920ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1293152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1294f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1295076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1296076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1297076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1298076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1299076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1300076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
13017ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
130242550becSJunchao Zhang 
13039566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
13049566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
13053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1306076ba34aSJunchao Zhang }
1307076ba34aSJunchao Zhang 
1308d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1309d71ae5a4SJacob Faibussowitsch {
1310076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1311076ba34aSJunchao Zhang   PetscInt    i, m, n;
1312076ba34aSJunchao Zhang 
1313076ba34aSJunchao Zhang   PetscFunctionBegin;
13145f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1315076ba34aSJunchao Zhang 
1316076ba34aSJunchao Zhang   m = akok->nrows();
1317076ba34aSJunchao Zhang   n = akok->ncols();
13189566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
13199566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1320076ba34aSJunchao Zhang 
1321076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
13229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1323076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ *)(A)->data;
1324076ba34aSJunchao Zhang 
1325076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1326076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1327076ba34aSJunchao Zhang 
1328076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1329076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1330076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1331076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1332076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1333076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1334076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1335076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1336076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1337076ba34aSJunchao Zhang 
13389566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
13399566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1340ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1341076ba34aSJunchao Zhang 
1342076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1343076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1344ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
13459566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
13469566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
13473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1348076ba34aSJunchao Zhang }
1349076ba34aSJunchao Zhang 
13500e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
13510e3ece09SJunchao Zhang {
13520e3ece09SJunchao Zhang   PetscFunctionBegin;
13530e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
13540e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
13550e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13560e3ece09SJunchao Zhang }
13570e3ece09SJunchao Zhang 
13580e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
13590e3ece09SJunchao Zhang {
13600e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
13610e3ece09SJunchao Zhang   PetscFunctionBegin;
13620e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
13630e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
13640e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
13650e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13660e3ece09SJunchao Zhang }
13670e3ece09SJunchao Zhang 
1368076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1369076ba34aSJunchao Zhang 
1370076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1371076ba34aSJunchao Zhang  */
1372d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1373d71ae5a4SJacob Faibussowitsch {
1374076ba34aSJunchao Zhang   PetscFunctionBegin;
13759566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
13769566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
13773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13788c3ff71bSJunchao Zhang }
13798c3ff71bSJunchao Zhang 
1380152b3e56SJunchao Zhang /*@C
138111a5261eSBarry Smith    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
13828c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
138320f4b53cSBarry Smith    Kokkos for calculations.
13848c3ff71bSJunchao Zhang 
13858c3ff71bSJunchao Zhang    Collective
13868c3ff71bSJunchao Zhang 
13878c3ff71bSJunchao Zhang    Input Parameters:
138811a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
13898c3ff71bSJunchao Zhang .  m - number of rows
13908c3ff71bSJunchao Zhang .  n - number of columns
139120f4b53cSBarry Smith .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
139220f4b53cSBarry Smith -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
13938c3ff71bSJunchao Zhang 
13948c3ff71bSJunchao Zhang    Output Parameter:
13958c3ff71bSJunchao Zhang .  A - the matrix
13968c3ff71bSJunchao Zhang 
13972ef1f0ffSBarry Smith    Level: intermediate
13982ef1f0ffSBarry Smith 
13992ef1f0ffSBarry Smith    Notes:
140011a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
14018c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
140211a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
14038c3ff71bSJunchao Zhang 
140411a5261eSBarry Smith    The AIJ format, also called
14052ef1f0ffSBarry Smith    compressed row storage, is fully compatible with standard Fortran
14068c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
140720f4b53cSBarry Smith    either one (as in Fortran) or zero.
14088c3ff71bSJunchao Zhang 
14092ef1f0ffSBarry Smith    Specify the preallocated storage with either `nz` or `nnz` (not both).
14102ef1f0ffSBarry Smith    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
14112ef1f0ffSBarry Smith    allocation.
14128c3ff71bSJunchao Zhang 
14132ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
14148c3ff71bSJunchao Zhang @*/
1415d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1416d71ae5a4SJacob Faibussowitsch {
14178c3ff71bSJunchao Zhang   PetscFunctionBegin;
14189566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
14199566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
14209566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
14219566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
14229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
14233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14248c3ff71bSJunchao Zhang }
1425930e68a5SMark Adams 
1426d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1427d71ae5a4SJacob Faibussowitsch {
1428930e68a5SMark Adams   PetscFunctionBegin;
14299566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
143086a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
14313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
143286a27549SJunchao Zhang }
143386a27549SJunchao Zhang 
1434d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1435d71ae5a4SJacob Faibussowitsch {
143686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
143786a27549SJunchao Zhang 
143886a27549SJunchao Zhang   PetscFunctionBegin;
143986a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
144086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
144186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
144286a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
144386a27549SJunchao Zhang   }
14443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
144586a27549SJunchao Zhang }
144686a27549SJunchao Zhang 
144786a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1448d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1449d71ae5a4SJacob Faibussowitsch {
145086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1451076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
145286a27549SJunchao Zhang 
145386a27549SJunchao Zhang   PetscFunctionBegin;
145486a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
145586a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1456*7b8d4ba6SJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1457*7b8d4ba6SJunchao Zhang     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1458*7b8d4ba6SJunchao Zhang     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
145986a27549SJunchao Zhang 
14609371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
146186a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
146286a27549SJunchao Zhang 
146386a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
146486a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
146586a27549SJunchao Zhang     */
14669371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
146786a27549SJunchao Zhang 
146886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
146986a27549SJunchao Zhang 
147086a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1471*7b8d4ba6SJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1472*7b8d4ba6SJunchao Zhang     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1473*7b8d4ba6SJunchao Zhang     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
147486a27549SJunchao Zhang 
14759371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
147686a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
147786a27549SJunchao Zhang 
147886a27549SJunchao Zhang     /* Sort indices. See comments above */
14799371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
148086a27549SJunchao Zhang 
148186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
148286a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
148386a27549SJunchao Zhang   }
14843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
148586a27549SJunchao Zhang }
148686a27549SJunchao Zhang 
148786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1488d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1489d71ae5a4SJacob Faibussowitsch {
149086a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
149186a27549SJunchao Zhang   PetscScalarKokkosView       xv;
149286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
149386a27549SJunchao Zhang 
149486a27549SJunchao Zhang   PetscFunctionBegin;
14959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
14969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
14979566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
14989566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
149986a27549SJunchao Zhang   /* Solve L tmpv = b */
15009566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
150186a27549SJunchao Zhang   /* Solve Ux = tmpv */
15029566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
15039566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
15049566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
15059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
150786a27549SJunchao Zhang }
150886a27549SJunchao Zhang 
1509076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
1510d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1511d71ae5a4SJacob Faibussowitsch {
151286a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
151386a27549SJunchao Zhang   PetscScalarKokkosView       xv;
151486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
151586a27549SJunchao Zhang 
151686a27549SJunchao Zhang   PetscFunctionBegin;
15179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
15189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
15199566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
15209566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
152186a27549SJunchao Zhang   /* Solve U^T tmpv = b */
152286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
152386a27549SJunchao Zhang 
152486a27549SJunchao Zhang   /* Solve L^T x = tmpv */
152586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
15269566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
15279566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
15289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
153086a27549SJunchao Zhang }
153186a27549SJunchao Zhang 
1532d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1533d71ae5a4SJacob Faibussowitsch {
153486a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
153586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
153686a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
153786a27549SJunchao Zhang 
153886a27549SJunchao Zhang   PetscFunctionBegin;
15399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
15409566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1541076ba34aSJunchao Zhang 
1542076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1543076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1544076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1545076ba34aSJunchao Zhang 
1546076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
154786a27549SJunchao Zhang 
154886a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
154986a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
155086a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
155186a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
155286a27549SJunchao Zhang   B->ops->matsolve          = NULL;
155386a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
155486a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
155586a27549SJunchao Zhang 
155686a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
155786a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
155886a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1559eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
15609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
156286a27549SJunchao Zhang }
156386a27549SJunchao Zhang 
1564d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1565d71ae5a4SJacob Faibussowitsch {
156686a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
156786a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
156886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
156986a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
157086a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
157186a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
157286a27549SJunchao Zhang 
157386a27549SJunchao Zhang   PetscFunctionBegin;
15749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
157586a27549SJunchao Zhang   /* Rebuild factors */
15769371c9d4SSatish Balay   if (factors) {
15779371c9d4SSatish Balay     factors->Destroy();
15789371c9d4SSatish Balay   } /* Destroy the old if it exists */
15799371c9d4SSatish Balay   else {
15809371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
15819371c9d4SSatish Balay   }
158286a27549SJunchao Zhang 
158386a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
158486a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
158586a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
158686a27549SJunchao Zhang 
158786a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
158886a27549SJunchao Zhang 
158986a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
159086a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
159186a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
159286a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
159386a27549SJunchao Zhang 
159486a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1595076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1596076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1597076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
159886a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
159986a27549SJunchao Zhang 
160086a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
160186a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
160286a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
160386a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
160486a27549SJunchao Zhang 
160586a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
160686a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
160786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
160886a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
160986a27549SJunchao Zhang #else
161086a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
161186a27549SJunchao Zhang #endif
161286a27549SJunchao Zhang 
161386a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
161486a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
161586a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
161686a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
161786a27549SJunchao Zhang 
161886a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
16199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
162086a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
162186a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
162286a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1623a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
162486a27549SJunchao Zhang 
162586a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
162686a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
16273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1628930e68a5SMark Adams }
1629930e68a5SMark Adams 
1630d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1631d71ae5a4SJacob Faibussowitsch {
1632930e68a5SMark Adams   PetscFunctionBegin;
1633930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
16343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1635930e68a5SMark Adams }
1636930e68a5SMark Adams 
1637930e68a5SMark Adams /*MC
163886a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
163911a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1640930e68a5SMark Adams 
1641930e68a5SMark Adams   Level: beginner
1642930e68a5SMark Adams 
16432ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1644930e68a5SMark Adams M*/
164586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1646930e68a5SMark Adams {
1647930e68a5SMark Adams   PetscInt n = A->rmap->n;
1648930e68a5SMark Adams 
1649930e68a5SMark Adams   PetscFunctionBegin;
16509566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
16519566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1652930e68a5SMark Adams   (*B)->factortype = ftype;
16539566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
16549566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1655930e68a5SMark Adams 
16568f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
16579566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
165886a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
165986a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
166086a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
16619566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
166286a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
166386a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
166498921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1665930e68a5SMark Adams 
16669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
16679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
16683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1669930e68a5SMark Adams }
16708f7e8f9dSMark Adams 
1671d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1672d71ae5a4SJacob Faibussowitsch {
167386a27549SJunchao Zhang   PetscFunctionBegin;
16749566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
16759566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
16763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
167786a27549SJunchao Zhang }
167886a27549SJunchao Zhang 
1679076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1680d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1681d71ae5a4SJacob Faibussowitsch {
1682076ba34aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1683076ba34aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1684076ba34aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1685076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1686076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1687076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1688076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1689076ba34aSJunchao Zhang 
1690076ba34aSJunchao Zhang   PetscFunctionBegin;
16919566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1692076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
16939566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
169448a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
16959566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1696076ba34aSJunchao Zhang   }
16973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1698076ba34aSJunchao Zhang }
1699