xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision cc6e31f1f717aeb75039aa862adac3b66a42690f)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3076ba34aSJunchao Zhang #include <petscpkg_version.h>
4152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
542550becSJunchao Zhang #include <petsc/private/sfimpl.h>
68c3ff71bSJunchao Zhang #include <petscsystypes.h>
78c3ff71bSJunchao Zhang #include <petscerror.h>
88c3ff71bSJunchao Zhang 
98c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
10f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
12*cc6e31f1SJunchao Zhang 
13*cc6e31f1SJunchao Zhang // To suppress compiler warnings:
14*cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
15*cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
16*cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
17*cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
18*cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
198c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
20*cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
21*cc6e31f1SJunchao Zhang 
2286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
2386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
24076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
25076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
269d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
279d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
2886a27549SJunchao Zhang 
2942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
308c3ff71bSJunchao Zhang 
310e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
33f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
349371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
35f98996d3SJunchao Zhang #else
36f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
37f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
389371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
39f98996d3SJunchao Zhang #endif
40f98996d3SJunchao Zhang 
418c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
428c3ff71bSJunchao Zhang 
43076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
44076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
45076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
46076ba34aSJunchao Zhang  */
47d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
48d71ae5a4SJacob Faibussowitsch {
49076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
50076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
518c3ff71bSJunchao Zhang 
528c3ff71bSJunchao Zhang   PetscFunctionBegin;
533ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
549566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
55076ba34aSJunchao Zhang 
56076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
57076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
58076ba34aSJunchao Zhang 
59076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
60076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
61076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
62076ba34aSJunchao Zhang   */
63076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
64076ba34aSJunchao Zhang     delete aijkok;
65f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
66076ba34aSJunchao Zhang     A->spptr = aijkok;
67f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
68f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
69f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
70f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
71076ba34aSJunchao Zhang   }
723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
738c3ff71bSJunchao Zhang }
748c3ff71bSJunchao Zhang 
7586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
76d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
77d71ae5a4SJacob Faibussowitsch {
788c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
798c3ff71bSJunchao Zhang 
808c3ff71bSJunchao Zhang   PetscFunctionBegin;
81aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
825f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
83076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
84076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
85580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
8686a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
878c3ff71bSJunchao Zhang   }
883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
898c3ff71bSJunchao Zhang }
908c3ff71bSJunchao Zhang 
91076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
92d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
93d71ae5a4SJacob Faibussowitsch {
9486a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
9586a27549SJunchao Zhang 
9686a27549SJunchao Zhang   PetscFunctionBegin;
975f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
9886a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
9986a27549SJunchao Zhang   aijkok->a_dual.modify_device();
10086a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
10186a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
1029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
1039566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
1043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10586a27549SJunchao Zhang }
10686a27549SJunchao Zhang 
107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
108d71ae5a4SJacob Faibussowitsch {
109f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110e36ced11SJunchao Zhang   auto             &exec   = PetscGetKokkosExecutionSpace();
111f0cf5187SStefano Zampini 
112f0cf5187SStefano Zampini   PetscFunctionBegin;
113f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11486a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
115aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1165f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
117e36ced11SJunchao Zhang   PetscCallCXX(aijkok->a_dual.sync_host(exec));
118e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
120f0cf5187SStefano Zampini }
121f0cf5187SStefano Zampini 
122d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
123d71ae5a4SJacob Faibussowitsch {
124076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
125f0cf5187SStefano Zampini 
126f0cf5187SStefano Zampini   PetscFunctionBegin;
1275519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1285519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1295519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1305519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1315519a089SJose E. Roman   */
1325519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
134e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
135e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
136076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
137076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
138076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
139076ba34aSJunchao Zhang   }
1403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
144d71ae5a4SJacob Faibussowitsch {
145076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang   PetscFunctionBegin;
1485519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
150076ba34aSJunchao Zhang }
151076ba34aSJunchao Zhang 
152d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
153d71ae5a4SJacob Faibussowitsch {
154076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
155076ba34aSJunchao Zhang 
156076ba34aSJunchao Zhang   PetscFunctionBegin;
1575519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
158e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
159e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
160e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
161076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1622328674fSJunchao Zhang   } else {
1632328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1642328674fSJunchao Zhang   }
1653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
166076ba34aSJunchao Zhang }
167076ba34aSJunchao Zhang 
168d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
169d71ae5a4SJacob Faibussowitsch {
170076ba34aSJunchao Zhang   PetscFunctionBegin;
171076ba34aSJunchao Zhang   *array = NULL;
1723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
173076ba34aSJunchao Zhang }
174076ba34aSJunchao Zhang 
175d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
176d71ae5a4SJacob Faibussowitsch {
177076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
178076ba34aSJunchao Zhang 
179076ba34aSJunchao Zhang   PetscFunctionBegin;
1805519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
181076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1822328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1832328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1842328674fSJunchao Zhang   }
1853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
186076ba34aSJunchao Zhang }
187076ba34aSJunchao Zhang 
188d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
189d71ae5a4SJacob Faibussowitsch {
190076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
191076ba34aSJunchao Zhang 
192076ba34aSJunchao Zhang   PetscFunctionBegin;
1935519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
194076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
195076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1962328674fSJunchao Zhang   }
1973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
198f0cf5187SStefano Zampini }
199f0cf5187SStefano Zampini 
200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
201d71ae5a4SJacob Faibussowitsch {
2027ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2037ee59b9bSJunchao Zhang 
2047ee59b9bSJunchao Zhang   PetscFunctionBegin;
2057ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
2067ee59b9bSJunchao Zhang 
2077ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
2087ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2097ee59b9bSJunchao Zhang   if (a) {
2107ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
2117ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2127ee59b9bSJunchao Zhang   }
2137ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2157ee59b9bSJunchao Zhang }
2167ee59b9bSJunchao Zhang 
2170e3ece09SJunchao Zhang /*
2180e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2190e3ece09SJunchao Zhang 
2200e3ece09SJunchao Zhang   Input Parameter:
2210e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2220e3ece09SJunchao Zhang 
2230e3ece09SJunchao Zhang   Output Parameters:
2240e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
225aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2260e3ece09SJunchao Zhang */
2270e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
228d71ae5a4SJacob Faibussowitsch {
2290e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2300e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2310e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2327b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2330e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2347b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2357b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2360e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2370e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2380e3ece09SJunchao Zhang   PetscInt               *offset;
239152b3e56SJunchao Zhang 
240152b3e56SJunchao Zhang   PetscFunctionBegin;
2410e3ece09SJunchao Zhang   // Populate Ti
2420e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2430e3ece09SJunchao Zhang   Ti++;
2440e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2450e3ece09SJunchao Zhang   Ti--;
2460e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2470e3ece09SJunchao Zhang 
2480e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2490e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2500e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2510e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2520e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2530e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2540e3ece09SJunchao Zhang 
2550e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2560e3ece09SJunchao Zhang       perm[disp] = j;
2570e3ece09SJunchao Zhang       offset[r]++;
258076ba34aSJunchao Zhang     }
2590e3ece09SJunchao Zhang   }
2600e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2610e3ece09SJunchao Zhang 
2620e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2630e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2640e3ece09SJunchao Zhang 
2650e3ece09SJunchao Zhang   // Output perm and T on device
2660e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2670e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2680e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2690e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
271152b3e56SJunchao Zhang }
272152b3e56SJunchao Zhang 
2730e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
2740e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
2750e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
276d71ae5a4SJacob Faibussowitsch {
2770e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2780e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2790e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2800e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
281152b3e56SJunchao Zhang 
282152b3e56SJunchao Zhang   PetscFunctionBegin;
2830e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
284145b44c9SPierre Jolivet   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
2850e3ece09SJunchao Zhang 
2860e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
2870e3ece09SJunchao Zhang 
2880e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
2890e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
2900e3ece09SJunchao Zhang   } else {
2910e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
2920e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
2930e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
2940e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
2950e3ece09SJunchao Zhang         auto       &Ta   = T.values;
2960e3ece09SJunchao Zhang 
297d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
298076ba34aSJunchao Zhang       }
2990e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3000e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3010e3ece09SJunchao Zhang 
3020e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3030e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
304d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
3050e3ece09SJunchao Zhang     }
3060e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
3070e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
3080e3ece09SJunchao Zhang   }
3090e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3100e3ece09SJunchao Zhang }
3110e3ece09SJunchao Zhang 
3120e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3130e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3140e3ece09SJunchao Zhang {
3150e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3160e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3170e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3180e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3190e3ece09SJunchao Zhang 
3200e3ece09SJunchao Zhang   PetscFunctionBegin;
3210e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
3220e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
3230e3ece09SJunchao Zhang 
3240e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3250e3ece09SJunchao Zhang 
3260e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3270e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3280e3ece09SJunchao Zhang   } else {
3290e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3300e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3310e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3320e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3330e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3340e3ece09SJunchao Zhang 
335d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3360e3ece09SJunchao Zhang       }
3370e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3380e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3390e3ece09SJunchao Zhang 
3400e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3410e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
342d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3430e3ece09SJunchao Zhang     }
3440e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3450e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3460e3ece09SJunchao Zhang   }
3473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
348152b3e56SJunchao Zhang }
349a587d139SMark 
3508c3ff71bSJunchao Zhang /* y = A x */
351d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
352d71ae5a4SJacob Faibussowitsch {
3538c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
354152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
355152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3568c3ff71bSJunchao Zhang 
3578c3ff71bSJunchao Zhang   PetscFunctionBegin;
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3609566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3619566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3628c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
363d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3649566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3659566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
366076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3679566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3708c3ff71bSJunchao Zhang }
3718c3ff71bSJunchao Zhang 
3728c3ff71bSJunchao Zhang /* y = A^T x */
373d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
374d71ae5a4SJacob Faibussowitsch {
3758c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
376152b3e56SJunchao Zhang   const char                *mode;
377152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
378152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3790e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3808c3ff71bSJunchao Zhang 
3818c3ff71bSJunchao Zhang   PetscFunctionBegin;
3829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3849566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3859566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
386152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
388152b3e56SJunchao Zhang     mode = "N";
389152b3e56SJunchao Zhang   } else {
390076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3910e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
392152b3e56SJunchao Zhang     mode   = "T";
393152b3e56SJunchao Zhang   }
394d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
3959566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3969566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3970e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
3989566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4008c3ff71bSJunchao Zhang }
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang /* y = A^H x */
403d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
404d71ae5a4SJacob Faibussowitsch {
4058c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
406152b3e56SJunchao Zhang   const char                *mode;
407152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
408152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4090e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4108c3ff71bSJunchao Zhang 
4118c3ff71bSJunchao Zhang   PetscFunctionBegin;
4129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4149566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4159566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
416152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
418152b3e56SJunchao Zhang     mode = "N";
419152b3e56SJunchao Zhang   } else {
420076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4210e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
422152b3e56SJunchao Zhang     mode   = "C";
423152b3e56SJunchao Zhang   }
424d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4259566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4269566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4270e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4308c3ff71bSJunchao Zhang }
4318c3ff71bSJunchao Zhang 
4328c3ff71bSJunchao Zhang /* z = A x + y */
433d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
434d71ae5a4SJacob Faibussowitsch {
4358c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
43692896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
437152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4388c3ff71bSJunchao Zhang 
4398c3ff71bSJunchao Zhang   PetscFunctionBegin;
4409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
44292896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
4439566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
44492896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
4458c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
446d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4479566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
44892896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4528c3ff71bSJunchao Zhang }
4538c3ff71bSJunchao Zhang 
4548c3ff71bSJunchao Zhang /* z = A^T x + y */
455d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
456d71ae5a4SJacob Faibussowitsch {
4578c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
458152b3e56SJunchao Zhang   const char                *mode;
45992896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
460152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4610e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4628c3ff71bSJunchao Zhang 
4638c3ff71bSJunchao Zhang   PetscFunctionBegin;
4649566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
46692896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4679566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
46892896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
469152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
471152b3e56SJunchao Zhang     mode = "N";
472152b3e56SJunchao Zhang   } else {
473076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4740e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
475152b3e56SJunchao Zhang     mode   = "T";
476152b3e56SJunchao Zhang   }
477d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
4789566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
47992896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4800e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4838c3ff71bSJunchao Zhang }
4848c3ff71bSJunchao Zhang 
4858c3ff71bSJunchao Zhang /* z = A^H x + y */
486d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
487d71ae5a4SJacob Faibussowitsch {
4888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
489152b3e56SJunchao Zhang   const char                *mode;
49092896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
491152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4920e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4938c3ff71bSJunchao Zhang 
4948c3ff71bSJunchao Zhang   PetscFunctionBegin;
4959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
49792896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4989566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
49992896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
500152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
5019566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
502152b3e56SJunchao Zhang     mode = "N";
503152b3e56SJunchao Zhang   } else {
504076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5050e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
506152b3e56SJunchao Zhang     mode   = "C";
507152b3e56SJunchao Zhang   }
508d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5099566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
51092896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5110e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
514152b3e56SJunchao Zhang }
515152b3e56SJunchao Zhang 
51666976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
517d71ae5a4SJacob Faibussowitsch {
518152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
519152b3e56SJunchao Zhang 
520152b3e56SJunchao Zhang   PetscFunctionBegin;
521152b3e56SJunchao Zhang   switch (op) {
522152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
523152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5249566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
525152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
526152b3e56SJunchao Zhang     break;
527d71ae5a4SJacob Faibussowitsch   default:
528d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
529d71ae5a4SJacob Faibussowitsch     break;
530152b3e56SJunchao Zhang   }
5313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5328c3ff71bSJunchao Zhang }
5338c3ff71bSJunchao Zhang 
534076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
535d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
536d71ae5a4SJacob Faibussowitsch {
537076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5388c3ff71bSJunchao Zhang 
5398c3ff71bSJunchao Zhang   PetscFunctionBegin;
5409566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
541076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
5429566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
5438c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5449566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
545076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5465f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5479566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5489566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5499566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5509566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
551076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
552394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5535f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
554f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5558c3ff71bSJunchao Zhang     }
556076ba34aSJunchao Zhang   }
5573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5588c3ff71bSJunchao Zhang }
5598c3ff71bSJunchao Zhang 
560076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
561076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
562076ba34aSJunchao Zhang  */
563d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
564d71ae5a4SJacob Faibussowitsch {
565076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
566076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
567076ba34aSJunchao Zhang   Mat               mat;
5688c3ff71bSJunchao Zhang 
5698c3ff71bSJunchao Zhang   PetscFunctionBegin;
570076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5719566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
572076ba34aSJunchao Zhang   mat = *B;
573f4747e26SJunchao Zhang   if (A->assembled) {
574076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
575f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
576076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
577076ba34aSJunchao Zhang     /* Now copy values to B if needed */
578076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
579076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
580076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
581076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
582076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
583076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
584076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
585076ba34aSJunchao Zhang       }
586076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
587076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
588076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
589076ba34aSJunchao Zhang     }
590076ba34aSJunchao Zhang     mat->spptr = bkok;
591076ba34aSJunchao Zhang   }
592076ba34aSJunchao Zhang 
5939566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5949566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
5959566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
5969566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5988c3ff71bSJunchao Zhang }
5998c3ff71bSJunchao Zhang 
600d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
601d71ae5a4SJacob Faibussowitsch {
6020ecb592aSJunchao Zhang   Mat               At;
6030e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
6040ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
6050ecb592aSJunchao Zhang 
6060ecb592aSJunchao Zhang   PetscFunctionBegin;
6077fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
6089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6090ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
610ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6110e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6129566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6130ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6149566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6150ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6160ecb592aSJunchao Zhang     if ((*B)->assembled) {
6170ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6180e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6199566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6200ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6210ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6220e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6230e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6240e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6250e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6260ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6270ecb592aSJunchao Zhang   }
6283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6290ecb592aSJunchao Zhang }
6300ecb592aSJunchao Zhang 
631d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
632d71ae5a4SJacob Faibussowitsch {
63386a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6348c3ff71bSJunchao Zhang 
6358c3ff71bSJunchao Zhang   PetscFunctionBegin;
63686a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
63786a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6388c3ff71bSJunchao Zhang     delete aijkok;
63986a27549SJunchao Zhang   } else {
64086a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
64186a27549SJunchao Zhang   }
642cbc6b225SStefano Zampini   A->spptr = NULL;
6439566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6449566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6459566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
64657761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
64757761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
64857761e9aSJunchao Zhang #endif
6499566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6518c3ff71bSJunchao Zhang }
6528c3ff71bSJunchao Zhang 
6533f3ba80aSJunchao Zhang /*MC
6543f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6553f3ba80aSJunchao Zhang 
65615229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6573f3ba80aSJunchao Zhang 
6582ef1f0ffSBarry Smith    Options Database Key:
65911a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6603f3ba80aSJunchao Zhang 
6613f3ba80aSJunchao Zhang   Level: beginner
6623f3ba80aSJunchao Zhang 
6631cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6643f3ba80aSJunchao Zhang M*/
665d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
666d71ae5a4SJacob Faibussowitsch {
66786a27549SJunchao Zhang   PetscFunctionBegin;
6689566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6699566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6709566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
67286a27549SJunchao Zhang }
67386a27549SJunchao Zhang 
674076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
675d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
676d71ae5a4SJacob Faibussowitsch {
677076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
678076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
679076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
680076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
681076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
682076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
683a3f881fbSStefano Zampini 
684a3f881fbSStefano Zampini   PetscFunctionBegin;
685076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
686076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
6874f572ea9SToby Isaac   PetscAssertPointer(C, 4);
688076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
689076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
6905f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
6915f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
692076ba34aSJunchao Zhang 
6939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6949566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
695076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
696076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
697076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
698076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
699076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
700076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
701076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
702076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
703076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
704076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
705076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
706076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
707076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
708076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
709076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
710076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
711076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
712076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
713076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
714076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
715076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
716076ba34aSJunchao Zhang 
717076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7189371c9d4SSatish Balay     Kokkos::parallel_for(
719d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
720076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
721076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
722076ba34aSJunchao Zhang 
723076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
724076ba34aSJunchao Zhang                                                    ci(i) = coffset;
725076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
726076ba34aSJunchao Zhang         });
727076ba34aSJunchao Zhang 
728076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
729076ba34aSJunchao Zhang           if (k < alen) {
730076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
731076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
732076ba34aSJunchao Zhang           } else {
733076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
734076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
735076ba34aSJunchao Zhang           }
736076ba34aSJunchao Zhang         });
737076ba34aSJunchao Zhang       });
738076ba34aSJunchao Zhang     ca_dual.modify_device();
739076ba34aSJunchao Zhang     ci_dual.modify_device();
740076ba34aSJunchao Zhang     cj_dual.modify_device();
7419566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
7429566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
743076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
744076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
745076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
746076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
747076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
748076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
749076ba34aSJunchao Zhang 
7509371c9d4SSatish Balay     Kokkos::parallel_for(
751d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
753076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
755076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
756076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
757076ba34aSJunchao Zhang         });
758076ba34aSJunchao Zhang       });
7599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
760076ba34aSJunchao Zhang   }
7613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
762076ba34aSJunchao Zhang }
763076ba34aSJunchao Zhang 
764d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
765d71ae5a4SJacob Faibussowitsch {
766076ba34aSJunchao Zhang   PetscFunctionBegin;
767076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
769a3f881fbSStefano Zampini }
770a3f881fbSStefano Zampini 
771d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
772d71ae5a4SJacob Faibussowitsch {
773a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
774a3f881fbSStefano Zampini   Mat                          A, B;
775076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
776a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
777a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
778076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
7790e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
780a3f881fbSStefano Zampini 
781a3f881fbSStefano Zampini   PetscFunctionBegin;
782a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7835f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
784076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
785076ba34aSJunchao Zhang 
7860e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
7870e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
7880e3ece09SJunchao Zhang   // we still do numeric.
7890e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
7900e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
7913ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
792076ba34aSJunchao Zhang   }
793076ba34aSJunchao Zhang 
794076ba34aSJunchao Zhang   switch (product->type) {
7959371c9d4SSatish Balay   case MATPRODUCT_AB:
7969371c9d4SSatish Balay     transA = false;
7979371c9d4SSatish Balay     transB = false;
7989371c9d4SSatish Balay     break;
7999371c9d4SSatish Balay   case MATPRODUCT_AtB:
8009371c9d4SSatish Balay     transA = true;
8019371c9d4SSatish Balay     transB = false;
8029371c9d4SSatish Balay     break;
8039371c9d4SSatish Balay   case MATPRODUCT_ABt:
8049371c9d4SSatish Balay     transA = false;
8059371c9d4SSatish Balay     transB = true;
8069371c9d4SSatish Balay     break;
807d71ae5a4SJacob Faibussowitsch   default:
808d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
809076ba34aSJunchao Zhang   }
810076ba34aSJunchao Zhang 
811a3f881fbSStefano Zampini   A = product->A;
812a3f881fbSStefano Zampini   B = product->B;
8139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
815a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
816a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
817a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
818076ba34aSJunchao Zhang 
8195f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
820076ba34aSJunchao Zhang 
8210e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8220e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
823076ba34aSJunchao Zhang 
824076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
825076ba34aSJunchao Zhang   if (transA) {
8269566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
827076ba34aSJunchao Zhang     transA = false;
828a3f881fbSStefano Zampini   }
829a3f881fbSStefano Zampini 
830076ba34aSJunchao Zhang   if (transB) {
8319566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
832076ba34aSJunchao Zhang     transB = false;
833076ba34aSJunchao Zhang   }
8349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8350e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8360e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
837866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
838866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
839e944a159SJunchao Zhang #endif
840866eb059SJunchao Zhang 
8419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
843a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
844a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8459566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8469566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8479566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
848a3f881fbSStefano Zampini   c->reallocs         = 0;
849076ba34aSJunchao Zhang   C->info.mallocs     = 0;
850a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
851a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
852a3f881fbSStefano Zampini   C->num_ass++;
8533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
854a3f881fbSStefano Zampini }
855a3f881fbSStefano Zampini 
856d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
857d71ae5a4SJacob Faibussowitsch {
858076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
859076ba34aSJunchao Zhang   MatProductType               ptype;
860076ba34aSJunchao Zhang   Mat                          A, B;
861076ba34aSJunchao Zhang   bool                         transA, transB;
862076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
863076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
864076ba34aSJunchao Zhang   MPI_Comm                     comm;
8650e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
866a3f881fbSStefano Zampini 
867a3f881fbSStefano Zampini   PetscFunctionBegin;
868a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8699566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8705f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
871a3f881fbSStefano Zampini   A = product->A;
872a3f881fbSStefano Zampini   B = product->B;
8739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
875a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
876a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8770e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8780e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
879076ba34aSJunchao Zhang 
880a3f881fbSStefano Zampini   ptype = product->type;
8810e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
8820e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
8830e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8840e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
8850e3ece09SJunchao Zhang   }
8860e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
8870e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8880e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
8890e3ece09SJunchao Zhang   }
8900e3ece09SJunchao Zhang 
891a3f881fbSStefano Zampini   switch (ptype) {
8929371c9d4SSatish Balay   case MATPRODUCT_AB:
8939371c9d4SSatish Balay     transA = false;
8949371c9d4SSatish Balay     transB = false;
8950e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
8969371c9d4SSatish Balay     break;
8979371c9d4SSatish Balay   case MATPRODUCT_AtB:
8989371c9d4SSatish Balay     transA = true;
8999371c9d4SSatish Balay     transB = false;
9000e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
9010e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
9029371c9d4SSatish Balay     break;
9039371c9d4SSatish Balay   case MATPRODUCT_ABt:
9049371c9d4SSatish Balay     transA = false;
9059371c9d4SSatish Balay     transB = true;
9060e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
9070e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
9089371c9d4SSatish Balay     break;
909d71ae5a4SJacob Faibussowitsch   default:
910d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
911a3f881fbSStefano Zampini   }
9120e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
913076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
914a3f881fbSStefano Zampini 
915076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
916866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
917866eb059SJunchao Zhang 
918866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
919866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
920866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
921866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
922866eb059SJunchao Zhang   #endif
923866eb059SJunchao Zhang #endif
9240e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
925076ba34aSJunchao Zhang 
9269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
927076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
928076ba34aSJunchao Zhang   if (transA) {
9299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
930076ba34aSJunchao Zhang     transA = false;
931076ba34aSJunchao Zhang   }
932076ba34aSJunchao Zhang 
933076ba34aSJunchao Zhang   if (transB) {
9349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
935076ba34aSJunchao Zhang     transB = false;
936076ba34aSJunchao Zhang   }
937076ba34aSJunchao Zhang 
9380e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
939076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
940076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
941076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
942076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
943076ba34aSJunchao Zhang   */
9440e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9450e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
946866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
947866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
948866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
949e944a159SJunchao Zhang #endif
9509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
951076ba34aSJunchao Zhang 
9529566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9539566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
954076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9553ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
956a3f881fbSStefano Zampini }
957a3f881fbSStefano Zampini 
958a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
959d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
960d71ae5a4SJacob Faibussowitsch {
961076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
962a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
963a3f881fbSStefano Zampini 
964a3f881fbSStefano Zampini   PetscFunctionBegin;
965a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9669566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
96748a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
968a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
969a3f881fbSStefano Zampini     switch (product->type) {
970a3f881fbSStefano Zampini     case MATPRODUCT_AB:
971a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
972d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
973d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
974d71ae5a4SJacob Faibussowitsch       break;
975a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
976a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
977d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
978d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
979d71ae5a4SJacob Faibussowitsch       break;
980d71ae5a4SJacob Faibussowitsch     default:
981d71ae5a4SJacob Faibussowitsch       break;
982a3f881fbSStefano Zampini     }
983a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
9849566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
985a3f881fbSStefano Zampini   }
9863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
987a3f881fbSStefano Zampini }
988a587d139SMark 
989d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
990d71ae5a4SJacob Faibussowitsch {
991f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
992f0cf5187SStefano Zampini 
993f0cf5187SStefano Zampini   PetscFunctionBegin;
9949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
9959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
996f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
997d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
9989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
9999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
10009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1002f0cf5187SStefano Zampini }
1003f0cf5187SStefano Zampini 
1004f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1005f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1006f4747e26SJunchao Zhang {
1007f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1008f4747e26SJunchao Zhang 
1009f4747e26SJunchao Zhang   PetscFunctionBegin;
1010f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1011f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1012f4747e26SJunchao Zhang 
1013f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1014f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1015f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1016f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1017f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1018d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1019f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1020f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1021f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1022f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1023f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1024f4747e26SJunchao Zhang   }
1025f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1026f4747e26SJunchao Zhang }
1027f4747e26SJunchao Zhang 
1028f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1029f4747e26SJunchao Zhang {
1030f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1031f4747e26SJunchao Zhang 
1032f4747e26SJunchao Zhang   PetscFunctionBegin;
1033f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1034f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1035f4747e26SJunchao Zhang     PetscInt                   n, nv;
1036f4747e26SJunchao Zhang 
1037f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1038f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1039f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1040f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1041f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1042f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1043f4747e26SJunchao Zhang 
1044f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1045f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1046f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1047f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1048d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1049f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1050f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1051f4747e26SJunchao Zhang       }));
1052f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1053f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1054f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1055f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1056f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1057f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1058f4747e26SJunchao Zhang   }
1059f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1060f4747e26SJunchao Zhang }
1061f4747e26SJunchao Zhang 
1062f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1063f4747e26SJunchao Zhang {
1064f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1065f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1066f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1067f4747e26SJunchao Zhang 
1068f4747e26SJunchao Zhang   PetscFunctionBegin;
1069f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1070f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1071f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1072f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1073f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1074f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1075f4747e26SJunchao Zhang   if (ll) {
1076f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1077f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1078f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1079f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1080d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1081f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1082f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1083f4747e26SJunchao Zhang         // scale entries on the row
1084f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1085f4747e26SJunchao Zhang       }));
1086f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1087f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1088f4747e26SJunchao Zhang   }
1089f4747e26SJunchao Zhang   if (rr) {
1090f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1091f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1092f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1093f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1094d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1095f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1096f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1097f4747e26SJunchao Zhang   }
1098f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1099f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1100f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1101f4747e26SJunchao Zhang }
1102f4747e26SJunchao Zhang 
1103d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1104d71ae5a4SJacob Faibussowitsch {
1105076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1106a587d139SMark 
1107a587d139SMark   PetscFunctionBegin;
1108076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
11092328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1110d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
11119566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
11122328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
11139566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
11142328674fSJunchao Zhang   }
11153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1116a587d139SMark }
1117a587d139SMark 
1118d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1119d71ae5a4SJacob Faibussowitsch {
1120f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1121f78ce678SMark Adams   PetscInt              n;
1122f78ce678SMark Adams   PetscScalarKokkosView xv;
1123f78ce678SMark Adams 
1124f78ce678SMark Adams   PetscFunctionBegin;
1125f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1126f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1127f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1128f78ce678SMark Adams 
1129f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1130f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1131f78ce678SMark Adams 
1132f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1133f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1134f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1135f78ce678SMark Adams 
1136f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11379371c9d4SSatish Balay   Kokkos::parallel_for(
1138d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1139f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1140f78ce678SMark Adams       else xv(i) = 0;
1141f78ce678SMark Adams     });
1142f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1144f78ce678SMark Adams }
1145f78ce678SMark Adams 
1146db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1147d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1148d71ae5a4SJacob Faibussowitsch {
1149db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1150db78de30SJunchao Zhang 
1151db78de30SJunchao Zhang   PetscFunctionBegin;
1152db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11534f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1154db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1156db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1157076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1159db78de30SJunchao Zhang }
1160db78de30SJunchao Zhang 
1161d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1162d71ae5a4SJacob Faibussowitsch {
1163db78de30SJunchao Zhang   PetscFunctionBegin;
1164db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11654f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1166db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1168db78de30SJunchao Zhang }
1169db78de30SJunchao Zhang 
1170d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1171d71ae5a4SJacob Faibussowitsch {
1172db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1173db78de30SJunchao Zhang 
1174db78de30SJunchao Zhang   PetscFunctionBegin;
1175db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11764f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1177db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1179db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1180076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1182db78de30SJunchao Zhang }
1183db78de30SJunchao Zhang 
1184d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1185d71ae5a4SJacob Faibussowitsch {
1186db78de30SJunchao Zhang   PetscFunctionBegin;
1187db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11884f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1189db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1192db78de30SJunchao Zhang }
1193db78de30SJunchao Zhang 
1194d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1195d71ae5a4SJacob Faibussowitsch {
1196db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1197db78de30SJunchao Zhang 
1198db78de30SJunchao Zhang   PetscFunctionBegin;
1199db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12004f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1201db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1203076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1205db78de30SJunchao Zhang }
1206db78de30SJunchao Zhang 
1207d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1208d71ae5a4SJacob Faibussowitsch {
1209db78de30SJunchao Zhang   PetscFunctionBegin;
1210db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12114f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1212db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1215db78de30SJunchao Zhang }
1216db78de30SJunchao Zhang 
1217c17cf699SJunchao Zhang /* Computes Y += alpha X */
1218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1219d71ae5a4SJacob Faibussowitsch {
1220a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1221c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1222c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1223c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
1224d326c3f1SJunchao Zhang   auto                    &exec = PetscGetKokkosExecutionSpace();
1225a587d139SMark 
1226a587d139SMark   PetscFunctionBegin;
1227c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1228c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1232db78de30SJunchao Zhang 
1233c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1234a587d139SMark     PetscBool e;
12359566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1236a587d139SMark     if (e) {
12379566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1238c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1239a587d139SMark     }
1240a587d139SMark   }
1241db78de30SJunchao Zhang 
1242c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1243c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1244c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1245c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1246c17cf699SJunchao Zhang   */
1247c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1248c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1249c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1250c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1251c17cf699SJunchao Zhang 
1252c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1253d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1255c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1256c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1257c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1258c17cf699SJunchao Zhang 
12599371c9d4SSatish Balay     Kokkos::parallel_for(
1260d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
12610e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
12620e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
12630e3ece09SJunchao Zhang           // Only one thread works in a team
1264c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
12650e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
12660e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
12670e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1268c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1269c17cf699SJunchao Zhang               q++;
1270a587d139SMark             } else {
12710e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
12720e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
12730e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
12740e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
12758b8b16f9SJunchao Zhang #else
12760e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
12778b8b16f9SJunchao Zhang #endif
1278a587d139SMark             }
1279c17cf699SJunchao Zhang           }
1280c17cf699SJunchao Zhang         });
1281c17cf699SJunchao Zhang       });
12829566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
12830e3ece09SJunchao Zhang   } else { // different nonzero patterns
1284c17cf699SJunchao Zhang     Mat             Z;
1285c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1286c17cf699SJunchao Zhang     KernelHandle    kh;
12870e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1288c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1289c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1290c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
12919566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
12929566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1293c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1294c17cf699SJunchao Zhang   }
12959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12960e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
12973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1298a587d139SMark }
1299a587d139SMark 
13002c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
13012c4ab24aSJunchao Zhang   PetscCount           n;
13022c4ab24aSJunchao Zhang   PetscCount           Atot;
13032c4ab24aSJunchao Zhang   PetscInt             nz;
13042c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
13052c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
13062c4ab24aSJunchao Zhang 
13072c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
13082c4ab24aSJunchao Zhang   {
13092c4ab24aSJunchao Zhang     nz   = coo_h->nz;
13102c4ab24aSJunchao Zhang     n    = coo_h->n;
13112c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
13122c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
13132c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
13142c4ab24aSJunchao Zhang   }
13152c4ab24aSJunchao Zhang };
13162c4ab24aSJunchao Zhang 
131749abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
13182c4ab24aSJunchao Zhang {
13192c4ab24aSJunchao Zhang   PetscFunctionBegin;
132049abdd8aSBarry Smith   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
13212c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13222c4ab24aSJunchao Zhang }
13232c4ab24aSJunchao Zhang 
1324d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1325d71ae5a4SJacob Faibussowitsch {
132642550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
132742550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
132803e76207SPierre Jolivet   PetscContainer             container_h;
13292c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13302c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
133142550becSJunchao Zhang 
133242550becSJunchao Zhang   PetscFunctionBegin;
13339566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1334394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
133542550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1336cbc6b225SStefano Zampini   delete akok;
1337f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13389566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13392c4ab24aSJunchao Zhang 
13402c4ab24aSJunchao Zhang   // Copy the COO struct to device
13412c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13422c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13432c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13442c4ab24aSJunchao Zhang 
13452c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
134603e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
13473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
134842550becSJunchao Zhang }
134942550becSJunchao Zhang 
1350d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1351d71ae5a4SJacob Faibussowitsch {
135242550becSJunchao Zhang   MatScalarKokkosView        Aa;
135342550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
135442550becSJunchao Zhang   PetscMemType               memtype;
13552c4ab24aSJunchao Zhang   PetscContainer             container;
13562c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
135742550becSJunchao Zhang 
135842550becSJunchao Zhang   PetscFunctionBegin;
13592c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13602c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
13612c4ab24aSJunchao Zhang 
13622c4ab24aSJunchao Zhang   const auto &n    = coo->n;
13632c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
13642c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
13652c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
13662c4ab24aSJunchao Zhang 
13679566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
136842550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
13692c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
137042550becSJunchao Zhang   } else {
13712c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
137242550becSJunchao Zhang   }
137342550becSJunchao Zhang 
1374c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1375c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
137642550becSJunchao Zhang 
137708bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
13789371c9d4SSatish Balay   Kokkos::parallel_for(
1379d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1380c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1381c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1382c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1383c7b718f4SJunchao Zhang     });
138408bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1385394ed5ebSJunchao Zhang 
13869566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
13879566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
13883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138942550becSJunchao Zhang }
139042550becSJunchao Zhang 
1391d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1392d71ae5a4SJacob Faibussowitsch {
13938f7e8f9dSMark Adams   PetscFunctionBegin;
13949566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
13959566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
13968f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
13973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13988f7e8f9dSMark Adams }
13998f7e8f9dSMark Adams 
1400d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1401d71ae5a4SJacob Faibussowitsch {
1402076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1403076ba34aSJunchao Zhang 
14048c3ff71bSJunchao Zhang   PetscFunctionBegin;
1405076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
14066f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
14076f3d89d0SStefano Zampini 
14088c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
14098c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
14108c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1411a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1412f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1413a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1414076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
14158c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
14168c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14178c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14188c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14198c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14208c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1421076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14220ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1423152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1424f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1425f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1426f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1427f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1428076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1429076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1430076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1431076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1432076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1433076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14347ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
143542550becSJunchao Zhang 
14369566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14379566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
143857761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
143957761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
144057761e9aSJunchao Zhang #endif
14413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1442076ba34aSJunchao Zhang }
1443076ba34aSJunchao Zhang 
14449d13fa56SJunchao Zhang /*
14459d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14469d13fa56SJunchao Zhang 
14479d13fa56SJunchao Zhang   Input Parameters:
14489d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14499d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14509d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14519d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14529d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14539d13fa56SJunchao Zhang 
14549d13fa56SJunchao Zhang   Output Parameter:
14559d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14569d13fa56SJunchao Zhang */
14579d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14589d13fa56SJunchao Zhang {
14599d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14609d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14619d13fa56SJunchao Zhang 
14629d13fa56SJunchao Zhang   PetscFunctionBegin;
14639d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14649d13fa56SJunchao Zhang 
14659d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14669d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14679d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14689d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
14699d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
14709d13fa56SJunchao Zhang   // TODO: how to tune the team size?
147145402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
14729d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
14739d13fa56SJunchao Zhang #else
14749d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
14759d13fa56SJunchao Zhang #endif
14769d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1477d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
14789d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
14799d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
14809d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
14819d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
14829d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
14839d13fa56SJunchao Zhang 
14849d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
14859d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
14869d13fa56SJunchao Zhang 
14879d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
14889d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
14899d13fa56SJunchao Zhang 
14909d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
14919d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
14929d13fa56SJunchao Zhang               B(r, c) = 0.0;
14939d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
14949d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
14959d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
14969d13fa56SJunchao Zhang               B(r, c) = 0.0;
14979d13fa56SJunchao Zhang             }
14989d13fa56SJunchao Zhang           }
14999d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
15009d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
15019d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
15029d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
15039d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
15049d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
15059d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
15069d13fa56SJunchao Zhang           }
15079d13fa56SJunchao Zhang         }
15089d13fa56SJunchao Zhang       });
15099d13fa56SJunchao Zhang 
15109d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
15119d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
15129d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
15139d13fa56SJunchao Zhang     }));
15149d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
15159d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15169d13fa56SJunchao Zhang }
15179d13fa56SJunchao Zhang 
1518d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1519d71ae5a4SJacob Faibussowitsch {
1520076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1521076ba34aSJunchao Zhang   PetscInt    i, m, n;
1522e36ced11SJunchao Zhang   auto       &exec = PetscGetKokkosExecutionSpace();
1523076ba34aSJunchao Zhang 
1524076ba34aSJunchao Zhang   PetscFunctionBegin;
15255f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1526076ba34aSJunchao Zhang 
1527076ba34aSJunchao Zhang   m = akok->nrows();
1528076ba34aSJunchao Zhang   n = akok->ncols();
15299566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15309566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1531076ba34aSJunchao Zhang 
1532076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
153457508eceSPierre Jolivet   aseq = (Mat_SeqAIJ *)A->data;
1535076ba34aSJunchao Zhang 
1536e36ced11SJunchao Zhang   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1537e36ced11SJunchao Zhang   PetscCallCXX(akok->j_dual.sync_host(exec));
1538e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1539076ba34aSJunchao Zhang 
1540076ba34aSJunchao Zhang   aseq->i       = akok->i_host_data();
1541076ba34aSJunchao Zhang   aseq->j       = akok->j_host_data();
1542076ba34aSJunchao Zhang   aseq->a       = akok->a_host_data();
1543076ba34aSJunchao Zhang   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1544076ba34aSJunchao Zhang   aseq->free_a  = PETSC_FALSE;
1545076ba34aSJunchao Zhang   aseq->free_ij = PETSC_FALSE;
1546076ba34aSJunchao Zhang   aseq->nz      = akok->nnz();
1547076ba34aSJunchao Zhang   aseq->maxnz   = aseq->nz;
1548076ba34aSJunchao Zhang 
15499566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1551ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1552076ba34aSJunchao Zhang 
1553076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1554076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1555ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15569566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15579566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1559076ba34aSJunchao Zhang }
1560076ba34aSJunchao Zhang 
15610e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15620e3ece09SJunchao Zhang {
15630e3ece09SJunchao Zhang   PetscFunctionBegin;
15640e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15650e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15660e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15670e3ece09SJunchao Zhang }
15680e3ece09SJunchao Zhang 
15690e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
15700e3ece09SJunchao Zhang {
15710e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
15724d86920dSPierre Jolivet 
15730e3ece09SJunchao Zhang   PetscFunctionBegin;
15740e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
15750e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
15760e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15770e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15780e3ece09SJunchao Zhang }
15790e3ece09SJunchao Zhang 
1580076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1581076ba34aSJunchao Zhang 
1582076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1583076ba34aSJunchao Zhang  */
1584d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1585d71ae5a4SJacob Faibussowitsch {
1586076ba34aSJunchao Zhang   PetscFunctionBegin;
15879566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
15889566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
15908c3ff71bSJunchao Zhang }
15918c3ff71bSJunchao Zhang 
1592152b3e56SJunchao Zhang /*@C
159311a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
15948c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
159520f4b53cSBarry Smith   Kokkos for calculations.
15968c3ff71bSJunchao Zhang 
15978c3ff71bSJunchao Zhang   Collective
15988c3ff71bSJunchao Zhang 
15998c3ff71bSJunchao Zhang   Input Parameters:
160011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
16018c3ff71bSJunchao Zhang . m    - number of rows
16028c3ff71bSJunchao Zhang . n    - number of columns
160320f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
160420f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
16058c3ff71bSJunchao Zhang 
16068c3ff71bSJunchao Zhang   Output Parameter:
16078c3ff71bSJunchao Zhang . A - the matrix
16088c3ff71bSJunchao Zhang 
16092ef1f0ffSBarry Smith   Level: intermediate
16102ef1f0ffSBarry Smith 
16112ef1f0ffSBarry Smith   Notes:
161211a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
16138c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
161411a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
16158c3ff71bSJunchao Zhang 
161611a5261eSBarry Smith   The AIJ format, also called
16172ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
16188c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
161920f4b53cSBarry Smith   either one (as in Fortran) or zero.
16208c3ff71bSJunchao Zhang 
16212ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16222ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16232ef1f0ffSBarry Smith   allocation.
16248c3ff71bSJunchao Zhang 
1625fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16268c3ff71bSJunchao Zhang @*/
1627d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1628d71ae5a4SJacob Faibussowitsch {
16298c3ff71bSJunchao Zhang   PetscFunctionBegin;
16309566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16319566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16329566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16339566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16368c3ff71bSJunchao Zhang }
1637930e68a5SMark Adams 
1638d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1639d71ae5a4SJacob Faibussowitsch {
1640930e68a5SMark Adams   PetscFunctionBegin;
16419566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
164286a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
16433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
164486a27549SJunchao Zhang }
164586a27549SJunchao Zhang 
1646d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1647d71ae5a4SJacob Faibussowitsch {
164886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
164986a27549SJunchao Zhang 
165086a27549SJunchao Zhang   PetscFunctionBegin;
165186a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
165286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
165386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
165486a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
165586a27549SJunchao Zhang   }
16563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
165786a27549SJunchao Zhang }
165886a27549SJunchao Zhang 
165986a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1660d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1661d71ae5a4SJacob Faibussowitsch {
166286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1663076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
166486a27549SJunchao Zhang 
166586a27549SJunchao Zhang   PetscFunctionBegin;
166686a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
166786a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
16687b8d4ba6SJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
16697b8d4ba6SJunchao Zhang     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
16707b8d4ba6SJunchao Zhang     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
167186a27549SJunchao Zhang 
16729371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
167386a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
167486a27549SJunchao Zhang 
167586a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
167686a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
167786a27549SJunchao Zhang     */
16789371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
167986a27549SJunchao Zhang 
168086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
168186a27549SJunchao Zhang 
168286a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
16837b8d4ba6SJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
16847b8d4ba6SJunchao Zhang     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
16857b8d4ba6SJunchao Zhang     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
168686a27549SJunchao Zhang 
16879371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
168886a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
168986a27549SJunchao Zhang 
169086a27549SJunchao Zhang     /* Sort indices. See comments above */
16919371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
169286a27549SJunchao Zhang 
169386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
169486a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
169586a27549SJunchao Zhang   }
16963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
169786a27549SJunchao Zhang }
169886a27549SJunchao Zhang 
169986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1700d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1701d71ae5a4SJacob Faibussowitsch {
170286a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
170386a27549SJunchao Zhang   PetscScalarKokkosView       xv;
170486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
170586a27549SJunchao Zhang 
170686a27549SJunchao Zhang   PetscFunctionBegin;
17079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
17099566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17109566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
171186a27549SJunchao Zhang   /* Solve L tmpv = b */
17129566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
171386a27549SJunchao Zhang   /* Solve Ux = tmpv */
17149566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
17159566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17169566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
171986a27549SJunchao Zhang }
172086a27549SJunchao Zhang 
1721076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
1722d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1723d71ae5a4SJacob Faibussowitsch {
172486a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
172586a27549SJunchao Zhang   PetscScalarKokkosView       xv;
172686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
172786a27549SJunchao Zhang 
172886a27549SJunchao Zhang   PetscFunctionBegin;
17299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
17319566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17329566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
173386a27549SJunchao Zhang   /* Solve U^T tmpv = b */
173486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
173586a27549SJunchao Zhang 
173686a27549SJunchao Zhang   /* Solve L^T x = tmpv */
173786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
17389566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17399566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
174286a27549SJunchao Zhang }
174386a27549SJunchao Zhang 
1744d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1745d71ae5a4SJacob Faibussowitsch {
174686a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
174786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
174886a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
174986a27549SJunchao Zhang 
175086a27549SJunchao Zhang   PetscFunctionBegin;
17519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1753076ba34aSJunchao Zhang 
1754076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1755076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1756076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1757076ba34aSJunchao Zhang 
1758076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
175986a27549SJunchao Zhang 
176086a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
176186a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
176286a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
176386a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
176486a27549SJunchao Zhang   B->ops->matsolve          = NULL;
176586a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
176686a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
176786a27549SJunchao Zhang 
176886a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
176986a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
177086a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1771eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
17729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
177486a27549SJunchao Zhang }
177586a27549SJunchao Zhang 
1776d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1777d71ae5a4SJacob Faibussowitsch {
177886a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
177986a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
178086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
178186a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
178286a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
178386a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
178486a27549SJunchao Zhang 
178586a27549SJunchao Zhang   PetscFunctionBegin;
17869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
178786a27549SJunchao Zhang   /* Rebuild factors */
17889371c9d4SSatish Balay   if (factors) {
17899371c9d4SSatish Balay     factors->Destroy();
17909371c9d4SSatish Balay   } /* Destroy the old if it exists */
17919371c9d4SSatish Balay   else {
17929371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
17939371c9d4SSatish Balay   }
179486a27549SJunchao Zhang 
179586a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
179686a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
179786a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
179886a27549SJunchao Zhang 
179986a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
180086a27549SJunchao Zhang 
180186a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
180286a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
180386a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
180486a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
180586a27549SJunchao Zhang 
180686a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1807076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1808076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1809076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
181086a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
181186a27549SJunchao Zhang 
181286a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
181386a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
181486a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
181586a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
181686a27549SJunchao Zhang 
181786a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
181886a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
181986a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
182086a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
182186a27549SJunchao Zhang #else
182286a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
182386a27549SJunchao Zhang #endif
182486a27549SJunchao Zhang 
182586a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
182686a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
182786a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
182886a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
182986a27549SJunchao Zhang 
183086a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
18319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
183286a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
183386a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
183486a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1835a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
183686a27549SJunchao Zhang 
183786a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
183886a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
18393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1840930e68a5SMark Adams }
1841930e68a5SMark Adams 
1842d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1843d71ae5a4SJacob Faibussowitsch {
1844930e68a5SMark Adams   PetscFunctionBegin;
1845930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
18463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1847930e68a5SMark Adams }
1848930e68a5SMark Adams 
1849930e68a5SMark Adams /*MC
185086a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
185111a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1852930e68a5SMark Adams 
1853930e68a5SMark Adams   Level: beginner
1854930e68a5SMark Adams 
18551cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1856930e68a5SMark Adams M*/
185786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1858930e68a5SMark Adams {
1859930e68a5SMark Adams   PetscInt n = A->rmap->n;
1860930e68a5SMark Adams 
1861930e68a5SMark Adams   PetscFunctionBegin;
18629566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
18639566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1864930e68a5SMark Adams   (*B)->factortype = ftype;
18659566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
18669566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1867930e68a5SMark Adams 
18688f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18699566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
187086a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
187186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
187286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
18739566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
187486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
187586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
187698921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1877930e68a5SMark Adams 
18789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1879f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
18803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1881930e68a5SMark Adams }
18828f7e8f9dSMark Adams 
1883d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1884d71ae5a4SJacob Faibussowitsch {
188586a27549SJunchao Zhang   PetscFunctionBegin;
18869566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
18879566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
18883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
188986a27549SJunchao Zhang }
189086a27549SJunchao Zhang 
1891076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1892d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1893d71ae5a4SJacob Faibussowitsch {
189445402d8aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
189545402d8aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
189645402d8aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1897076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1898076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1899076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1900076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1901076ba34aSJunchao Zhang 
1902076ba34aSJunchao Zhang   PetscFunctionBegin;
19039566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1904076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
19059566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
190648a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
19079566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1908076ba34aSJunchao Zhang   }
19093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1910076ba34aSJunchao Zhang }
1911