xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c0c276a7a9f347b22187dda26ae7d35d5b9ed8a2)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3*c0c276a7Ssdargavi #include <petscmat_kokkos.hpp>
4076ba34aSJunchao Zhang #include <petscpkg_version.h>
5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
642550becSJunchao Zhang #include <petsc/private/sfimpl.h>
78c3ff71bSJunchao Zhang #include <petscsystypes.h>
88c3ff71bSJunchao Zhang #include <petscerror.h>
98c3ff71bSJunchao Zhang 
108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
11f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
13cc6e31f1SJunchao Zhang 
14cc6e31f1SJunchao Zhang // To suppress compiler warnings:
15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
21cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22cc6e31f1SJunchao Zhang 
2386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
2486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
25076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
26076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
279d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
289d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
2986a27549SJunchao Zhang 
3042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
318c3ff71bSJunchao Zhang 
320e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
34f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
359371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
36f98996d3SJunchao Zhang #else
37f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
38f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
399371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
40f98996d3SJunchao Zhang #endif
41f98996d3SJunchao Zhang 
428c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
438c3ff71bSJunchao Zhang 
44076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
45076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
46076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
47076ba34aSJunchao Zhang  */
48d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
49d71ae5a4SJacob Faibussowitsch {
50076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
51076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
528c3ff71bSJunchao Zhang 
538c3ff71bSJunchao Zhang   PetscFunctionBegin;
543ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
559566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
56076ba34aSJunchao Zhang 
57076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
58076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
59076ba34aSJunchao Zhang 
60076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
61076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
62076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
63076ba34aSJunchao Zhang   */
64076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
65076ba34aSJunchao Zhang     delete aijkok;
66f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
67076ba34aSJunchao Zhang     A->spptr = aijkok;
68f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
69f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
70f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
71f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
72076ba34aSJunchao Zhang   }
733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
748c3ff71bSJunchao Zhang }
758c3ff71bSJunchao Zhang 
7686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
77d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
78d71ae5a4SJacob Faibussowitsch {
798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
808c3ff71bSJunchao Zhang 
818c3ff71bSJunchao Zhang   PetscFunctionBegin;
82aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
835f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
84076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
85076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
86580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
8786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
888c3ff71bSJunchao Zhang   }
893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
908c3ff71bSJunchao Zhang }
918c3ff71bSJunchao Zhang 
92076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
93d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
94d71ae5a4SJacob Faibussowitsch {
9586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
9686a27549SJunchao Zhang 
9786a27549SJunchao Zhang   PetscFunctionBegin;
985f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
9986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
10086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
10186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
10286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
1039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
1049566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
1053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
10686a27549SJunchao Zhang }
10786a27549SJunchao Zhang 
108d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
109d71ae5a4SJacob Faibussowitsch {
110f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1114df4a32cSJunchao Zhang   auto              exec   = PetscGetKokkosExecutionSpace();
112f0cf5187SStefano Zampini 
113f0cf5187SStefano Zampini   PetscFunctionBegin;
114f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11586a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
116aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1175f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
118e36ced11SJunchao Zhang   PetscCallCXX(aijkok->a_dual.sync_host(exec));
119e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
121f0cf5187SStefano Zampini }
122f0cf5187SStefano Zampini 
123d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
124d71ae5a4SJacob Faibussowitsch {
125076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
126f0cf5187SStefano Zampini 
127f0cf5187SStefano Zampini   PetscFunctionBegin;
1285519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1295519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1305519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1315519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1325519a089SJose E. Roman   */
1335519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
1344df4a32cSJunchao Zhang     auto exec = PetscGetKokkosExecutionSpace();
135e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
136e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
137076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
138076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
139076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
140076ba34aSJunchao Zhang   }
1413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
142076ba34aSJunchao Zhang }
143076ba34aSJunchao Zhang 
144d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
145d71ae5a4SJacob Faibussowitsch {
146076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
147076ba34aSJunchao Zhang 
148076ba34aSJunchao Zhang   PetscFunctionBegin;
1495519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
151076ba34aSJunchao Zhang }
152076ba34aSJunchao Zhang 
153d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
154d71ae5a4SJacob Faibussowitsch {
155076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156076ba34aSJunchao Zhang 
157076ba34aSJunchao Zhang   PetscFunctionBegin;
1585519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
1594df4a32cSJunchao Zhang     auto exec = PetscGetKokkosExecutionSpace();
160e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
161e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
162076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1632328674fSJunchao Zhang   } else {
1642328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1652328674fSJunchao Zhang   }
1663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
167076ba34aSJunchao Zhang }
168076ba34aSJunchao Zhang 
169d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
170d71ae5a4SJacob Faibussowitsch {
171076ba34aSJunchao Zhang   PetscFunctionBegin;
172076ba34aSJunchao Zhang   *array = NULL;
1733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
174076ba34aSJunchao Zhang }
175076ba34aSJunchao Zhang 
176d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177d71ae5a4SJacob Faibussowitsch {
178076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
179076ba34aSJunchao Zhang 
180076ba34aSJunchao Zhang   PetscFunctionBegin;
1815519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1832328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1842328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1852328674fSJunchao Zhang   }
1863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
187076ba34aSJunchao Zhang }
188076ba34aSJunchao Zhang 
189d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
190d71ae5a4SJacob Faibussowitsch {
191076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
192076ba34aSJunchao Zhang 
193076ba34aSJunchao Zhang   PetscFunctionBegin;
1945519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
195076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
196076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1972328674fSJunchao Zhang   }
1983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
199f0cf5187SStefano Zampini }
200f0cf5187SStefano Zampini 
201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
202d71ae5a4SJacob Faibussowitsch {
2037ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2047ee59b9bSJunchao Zhang 
2057ee59b9bSJunchao Zhang   PetscFunctionBegin;
2067ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
2077ee59b9bSJunchao Zhang 
2087ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
2097ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2107ee59b9bSJunchao Zhang   if (a) {
2117ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
2127ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2137ee59b9bSJunchao Zhang   }
2147ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2167ee59b9bSJunchao Zhang }
2177ee59b9bSJunchao Zhang 
2180e3ece09SJunchao Zhang /*
2190e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2200e3ece09SJunchao Zhang 
2210e3ece09SJunchao Zhang   Input Parameter:
2220e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2230e3ece09SJunchao Zhang 
2240e3ece09SJunchao Zhang   Output Parameters:
2250e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
226aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2270e3ece09SJunchao Zhang */
2280e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
229d71ae5a4SJacob Faibussowitsch {
2300e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2310e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2320e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2337b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2340e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2357b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2367b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2370e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2380e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2390e3ece09SJunchao Zhang   PetscInt               *offset;
240152b3e56SJunchao Zhang 
241152b3e56SJunchao Zhang   PetscFunctionBegin;
2420e3ece09SJunchao Zhang   // Populate Ti
2430e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2440e3ece09SJunchao Zhang   Ti++;
2450e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2460e3ece09SJunchao Zhang   Ti--;
2470e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2480e3ece09SJunchao Zhang 
2490e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2500e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2510e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2520e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2530e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2540e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2550e3ece09SJunchao Zhang 
2560e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2570e3ece09SJunchao Zhang       perm[disp] = j;
2580e3ece09SJunchao Zhang       offset[r]++;
259076ba34aSJunchao Zhang     }
2600e3ece09SJunchao Zhang   }
2610e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2620e3ece09SJunchao Zhang 
2630e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2640e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2650e3ece09SJunchao Zhang 
2660e3ece09SJunchao Zhang   // Output perm and T on device
2670e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2680e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2690e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2700e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
272152b3e56SJunchao Zhang }
273152b3e56SJunchao Zhang 
2740e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
2750e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
2760e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
277d71ae5a4SJacob Faibussowitsch {
2780e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2790e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2800e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2810e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
282152b3e56SJunchao Zhang 
283152b3e56SJunchao Zhang   PetscFunctionBegin;
2840e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
285145b44c9SPierre Jolivet   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
2860e3ece09SJunchao Zhang 
2870e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
2880e3ece09SJunchao Zhang 
2890e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
2900e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
2910e3ece09SJunchao Zhang   } else {
2920e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
2930e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
2940e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
2950e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
2960e3ece09SJunchao Zhang         auto       &Ta   = T.values;
2970e3ece09SJunchao Zhang 
298d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
299076ba34aSJunchao Zhang       }
3000e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3010e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3020e3ece09SJunchao Zhang 
3030e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3040e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
305d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
3060e3ece09SJunchao Zhang     }
3070e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
3080e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
3090e3ece09SJunchao Zhang   }
3100e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3110e3ece09SJunchao Zhang }
3120e3ece09SJunchao Zhang 
3130e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3140e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3150e3ece09SJunchao Zhang {
3160e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3170e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3180e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3190e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3200e3ece09SJunchao Zhang 
3210e3ece09SJunchao Zhang   PetscFunctionBegin;
3220e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
3230e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
3240e3ece09SJunchao Zhang 
3250e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3260e3ece09SJunchao Zhang 
3270e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3280e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3290e3ece09SJunchao Zhang   } else {
3300e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3310e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3320e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3330e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3340e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3350e3ece09SJunchao Zhang 
336d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3370e3ece09SJunchao Zhang       }
3380e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3390e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3400e3ece09SJunchao Zhang 
3410e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3420e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
343d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3440e3ece09SJunchao Zhang     }
3450e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3460e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3470e3ece09SJunchao Zhang   }
3483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
349152b3e56SJunchao Zhang }
350a587d139SMark 
3518c3ff71bSJunchao Zhang /* y = A x */
352d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
353d71ae5a4SJacob Faibussowitsch {
3548c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
355152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
356152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3578c3ff71bSJunchao Zhang 
3588c3ff71bSJunchao Zhang   PetscFunctionBegin;
3599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3609566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3619566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3629566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3638c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
364d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3659566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3669566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
367076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3718c3ff71bSJunchao Zhang }
3728c3ff71bSJunchao Zhang 
3738c3ff71bSJunchao Zhang /* y = A^T x */
374d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
375d71ae5a4SJacob Faibussowitsch {
3768c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
377152b3e56SJunchao Zhang   const char                *mode;
378152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
379152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3800e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3818c3ff71bSJunchao Zhang 
3828c3ff71bSJunchao Zhang   PetscFunctionBegin;
3839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3859566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3869566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
387152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
389152b3e56SJunchao Zhang     mode = "N";
390152b3e56SJunchao Zhang   } else {
391076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3920e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
393152b3e56SJunchao Zhang     mode   = "T";
394152b3e56SJunchao Zhang   }
395d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
3969566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3979566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3980e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
3999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4018c3ff71bSJunchao Zhang }
4028c3ff71bSJunchao Zhang 
4038c3ff71bSJunchao Zhang /* y = A^H x */
404d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405d71ae5a4SJacob Faibussowitsch {
4068c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
407152b3e56SJunchao Zhang   const char                *mode;
408152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
409152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4100e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4118c3ff71bSJunchao Zhang 
4128c3ff71bSJunchao Zhang   PetscFunctionBegin;
4139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4159566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4169566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
417152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
419152b3e56SJunchao Zhang     mode = "N";
420152b3e56SJunchao Zhang   } else {
421076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4220e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
423152b3e56SJunchao Zhang     mode   = "C";
424152b3e56SJunchao Zhang   }
425d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4269566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4279566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4280e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4318c3ff71bSJunchao Zhang }
4328c3ff71bSJunchao Zhang 
4338c3ff71bSJunchao Zhang /* z = A x + y */
434d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
435d71ae5a4SJacob Faibussowitsch {
4368c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
43792896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
438152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4398c3ff71bSJunchao Zhang 
4408c3ff71bSJunchao Zhang   PetscFunctionBegin;
4419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
44392896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
4449566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
44592896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
4468c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
447d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4489566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
44992896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4538c3ff71bSJunchao Zhang }
4548c3ff71bSJunchao Zhang 
4558c3ff71bSJunchao Zhang /* z = A^T x + y */
456d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
457d71ae5a4SJacob Faibussowitsch {
4588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
459152b3e56SJunchao Zhang   const char                *mode;
46092896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
461152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4620e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4638c3ff71bSJunchao Zhang 
4648c3ff71bSJunchao Zhang   PetscFunctionBegin;
4659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
46792896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4689566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
46992896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
470152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
472152b3e56SJunchao Zhang     mode = "N";
473152b3e56SJunchao Zhang   } else {
474076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4750e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
476152b3e56SJunchao Zhang     mode   = "T";
477152b3e56SJunchao Zhang   }
478d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
4799566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
48092896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4810e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4848c3ff71bSJunchao Zhang }
4858c3ff71bSJunchao Zhang 
4868c3ff71bSJunchao Zhang /* z = A^H x + y */
487d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
488d71ae5a4SJacob Faibussowitsch {
4898c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
490152b3e56SJunchao Zhang   const char                *mode;
49192896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
492152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4930e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4948c3ff71bSJunchao Zhang 
4958c3ff71bSJunchao Zhang   PetscFunctionBegin;
4969566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
49892896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4999566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
50092896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
501152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
5029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
503152b3e56SJunchao Zhang     mode = "N";
504152b3e56SJunchao Zhang   } else {
505076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5060e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
507152b3e56SJunchao Zhang     mode   = "C";
508152b3e56SJunchao Zhang   }
509d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5109566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
51192896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5120e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
515152b3e56SJunchao Zhang }
516152b3e56SJunchao Zhang 
51766976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
518d71ae5a4SJacob Faibussowitsch {
519152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
520152b3e56SJunchao Zhang 
521152b3e56SJunchao Zhang   PetscFunctionBegin;
522152b3e56SJunchao Zhang   switch (op) {
523152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
524152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5259566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
526152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
527152b3e56SJunchao Zhang     break;
528d71ae5a4SJacob Faibussowitsch   default:
529d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
530d71ae5a4SJacob Faibussowitsch     break;
531152b3e56SJunchao Zhang   }
5323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5338c3ff71bSJunchao Zhang }
5348c3ff71bSJunchao Zhang 
535076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
536d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
537d71ae5a4SJacob Faibussowitsch {
538076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5398c3ff71bSJunchao Zhang 
5408c3ff71bSJunchao Zhang   PetscFunctionBegin;
5419566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
542076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
5439566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
5448c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5459566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
546076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5475f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5489566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5499566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5509566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5519566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
552076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
553394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5545f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
555f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5568c3ff71bSJunchao Zhang     }
557076ba34aSJunchao Zhang   }
5583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5598c3ff71bSJunchao Zhang }
5608c3ff71bSJunchao Zhang 
561076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
562076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
563076ba34aSJunchao Zhang  */
564d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
565d71ae5a4SJacob Faibussowitsch {
566076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
567076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
568076ba34aSJunchao Zhang   Mat               mat;
5698c3ff71bSJunchao Zhang 
5708c3ff71bSJunchao Zhang   PetscFunctionBegin;
571076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5729566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
573076ba34aSJunchao Zhang   mat = *B;
574f4747e26SJunchao Zhang   if (A->assembled) {
575076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
576f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
577076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
578076ba34aSJunchao Zhang     /* Now copy values to B if needed */
579076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
580076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
581076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
582076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
583076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
584076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
585076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
586076ba34aSJunchao Zhang       }
587076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
588076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
589076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
590076ba34aSJunchao Zhang     }
591076ba34aSJunchao Zhang     mat->spptr = bkok;
592076ba34aSJunchao Zhang   }
593076ba34aSJunchao Zhang 
5949566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5959566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
5969566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
5979566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5998c3ff71bSJunchao Zhang }
6008c3ff71bSJunchao Zhang 
601d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
602d71ae5a4SJacob Faibussowitsch {
6030ecb592aSJunchao Zhang   Mat               At;
6040e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
6050ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
6060ecb592aSJunchao Zhang 
6070ecb592aSJunchao Zhang   PetscFunctionBegin;
6087fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
6099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6100ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
611ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6120e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6139566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6140ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6159566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6160ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6170ecb592aSJunchao Zhang     if ((*B)->assembled) {
6180ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6190e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6209566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6210ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6220ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6230e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6240e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6250e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6260e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6270ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6280ecb592aSJunchao Zhang   }
6293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6300ecb592aSJunchao Zhang }
6310ecb592aSJunchao Zhang 
632d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
633d71ae5a4SJacob Faibussowitsch {
63486a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6358c3ff71bSJunchao Zhang 
6368c3ff71bSJunchao Zhang   PetscFunctionBegin;
63786a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
63886a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6398c3ff71bSJunchao Zhang     delete aijkok;
64086a27549SJunchao Zhang   } else {
64186a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
64286a27549SJunchao Zhang   }
643cbc6b225SStefano Zampini   A->spptr = NULL;
6449566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6459566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6469566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
64757761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
64857761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
64957761e9aSJunchao Zhang #endif
6509566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6528c3ff71bSJunchao Zhang }
6538c3ff71bSJunchao Zhang 
6543f3ba80aSJunchao Zhang /*MC
6553f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6563f3ba80aSJunchao Zhang 
65715229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6583f3ba80aSJunchao Zhang 
6592ef1f0ffSBarry Smith    Options Database Key:
66011a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6613f3ba80aSJunchao Zhang 
6623f3ba80aSJunchao Zhang   Level: beginner
6633f3ba80aSJunchao Zhang 
6641cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6653f3ba80aSJunchao Zhang M*/
666d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
667d71ae5a4SJacob Faibussowitsch {
66886a27549SJunchao Zhang   PetscFunctionBegin;
6699566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6709566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6719566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
67386a27549SJunchao Zhang }
67486a27549SJunchao Zhang 
675076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
676d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
677d71ae5a4SJacob Faibussowitsch {
678076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
679076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
680076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
681076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
682076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
683076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
684a3f881fbSStefano Zampini 
685a3f881fbSStefano Zampini   PetscFunctionBegin;
686076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
687076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
6884f572ea9SToby Isaac   PetscAssertPointer(C, 4);
689076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
690076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
6915f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
6925f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
693076ba34aSJunchao Zhang 
6949566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
696076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
697076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
698076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
699076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
700076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
701076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
702076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
703076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
704076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
705076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
706076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
707076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
708076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
709076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
710076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
711076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
712076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
713076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
714076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
715076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
716076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
717076ba34aSJunchao Zhang 
718076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7199371c9d4SSatish Balay     Kokkos::parallel_for(
720d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
721076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
722076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
723076ba34aSJunchao Zhang 
724076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
725076ba34aSJunchao Zhang                                                    ci(i) = coffset;
726076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
727076ba34aSJunchao Zhang         });
728076ba34aSJunchao Zhang 
729076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
730076ba34aSJunchao Zhang           if (k < alen) {
731076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
732076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
733076ba34aSJunchao Zhang           } else {
734076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
735076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
736076ba34aSJunchao Zhang           }
737076ba34aSJunchao Zhang         });
738076ba34aSJunchao Zhang       });
739076ba34aSJunchao Zhang     ca_dual.modify_device();
740076ba34aSJunchao Zhang     ci_dual.modify_device();
741076ba34aSJunchao Zhang     cj_dual.modify_device();
7429566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
7439566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
744076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
745076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
746076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
747076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
748076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
749076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
750076ba34aSJunchao Zhang 
7519371c9d4SSatish Balay     Kokkos::parallel_for(
752d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
754076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
756076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
757076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
758076ba34aSJunchao Zhang         });
759076ba34aSJunchao Zhang       });
7609566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
761076ba34aSJunchao Zhang   }
7623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
763076ba34aSJunchao Zhang }
764076ba34aSJunchao Zhang 
765d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
766d71ae5a4SJacob Faibussowitsch {
767076ba34aSJunchao Zhang   PetscFunctionBegin;
768076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
770a3f881fbSStefano Zampini }
771a3f881fbSStefano Zampini 
772d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
773d71ae5a4SJacob Faibussowitsch {
774a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
775a3f881fbSStefano Zampini   Mat                          A, B;
776076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
777a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
778a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
779076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
7800e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
781a3f881fbSStefano Zampini 
782a3f881fbSStefano Zampini   PetscFunctionBegin;
783a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7845f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
785076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
786076ba34aSJunchao Zhang 
7870e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
7880e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
7890e3ece09SJunchao Zhang   // we still do numeric.
7900e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
7910e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
7923ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
793076ba34aSJunchao Zhang   }
794076ba34aSJunchao Zhang 
795076ba34aSJunchao Zhang   switch (product->type) {
7969371c9d4SSatish Balay   case MATPRODUCT_AB:
7979371c9d4SSatish Balay     transA = false;
7989371c9d4SSatish Balay     transB = false;
7999371c9d4SSatish Balay     break;
8009371c9d4SSatish Balay   case MATPRODUCT_AtB:
8019371c9d4SSatish Balay     transA = true;
8029371c9d4SSatish Balay     transB = false;
8039371c9d4SSatish Balay     break;
8049371c9d4SSatish Balay   case MATPRODUCT_ABt:
8059371c9d4SSatish Balay     transA = false;
8069371c9d4SSatish Balay     transB = true;
8079371c9d4SSatish Balay     break;
808d71ae5a4SJacob Faibussowitsch   default:
809d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
810076ba34aSJunchao Zhang   }
811076ba34aSJunchao Zhang 
812a3f881fbSStefano Zampini   A = product->A;
813a3f881fbSStefano Zampini   B = product->B;
8149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
816a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
817a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
818a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
819076ba34aSJunchao Zhang 
8205f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
821076ba34aSJunchao Zhang 
8220e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8230e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
824076ba34aSJunchao Zhang 
825076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
826076ba34aSJunchao Zhang   if (transA) {
8279566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
828076ba34aSJunchao Zhang     transA = false;
829a3f881fbSStefano Zampini   }
830a3f881fbSStefano Zampini 
831076ba34aSJunchao Zhang   if (transB) {
8329566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
833076ba34aSJunchao Zhang     transB = false;
834076ba34aSJunchao Zhang   }
8359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8360e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8370e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
838866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
839866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
840e944a159SJunchao Zhang #endif
841866eb059SJunchao Zhang 
8429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
844a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
845a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8469566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8479566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8489566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
849a3f881fbSStefano Zampini   c->reallocs         = 0;
850076ba34aSJunchao Zhang   C->info.mallocs     = 0;
851a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
852a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
853a3f881fbSStefano Zampini   C->num_ass++;
8543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
855a3f881fbSStefano Zampini }
856a3f881fbSStefano Zampini 
857d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
858d71ae5a4SJacob Faibussowitsch {
859076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
860076ba34aSJunchao Zhang   MatProductType               ptype;
861076ba34aSJunchao Zhang   Mat                          A, B;
862076ba34aSJunchao Zhang   bool                         transA, transB;
863076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
864076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
865076ba34aSJunchao Zhang   MPI_Comm                     comm;
8660e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
867a3f881fbSStefano Zampini 
868a3f881fbSStefano Zampini   PetscFunctionBegin;
869a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8709566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8715f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
872a3f881fbSStefano Zampini   A = product->A;
873a3f881fbSStefano Zampini   B = product->B;
8749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
876a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
877a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8780e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8790e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
880076ba34aSJunchao Zhang 
881a3f881fbSStefano Zampini   ptype = product->type;
8820e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
8830e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
8840e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8850e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
8860e3ece09SJunchao Zhang   }
8870e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
8880e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8890e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
8900e3ece09SJunchao Zhang   }
8910e3ece09SJunchao Zhang 
892a3f881fbSStefano Zampini   switch (ptype) {
8939371c9d4SSatish Balay   case MATPRODUCT_AB:
8949371c9d4SSatish Balay     transA = false;
8959371c9d4SSatish Balay     transB = false;
8960e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
8979371c9d4SSatish Balay     break;
8989371c9d4SSatish Balay   case MATPRODUCT_AtB:
8999371c9d4SSatish Balay     transA = true;
9009371c9d4SSatish Balay     transB = false;
9010e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
9020e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
9039371c9d4SSatish Balay     break;
9049371c9d4SSatish Balay   case MATPRODUCT_ABt:
9059371c9d4SSatish Balay     transA = false;
9069371c9d4SSatish Balay     transB = true;
9070e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
9080e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
9099371c9d4SSatish Balay     break;
910d71ae5a4SJacob Faibussowitsch   default:
911d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
912a3f881fbSStefano Zampini   }
9130e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
914076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
915a3f881fbSStefano Zampini 
916076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
917866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
918866eb059SJunchao Zhang 
919866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
920866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
921866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
922866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
923866eb059SJunchao Zhang   #endif
924866eb059SJunchao Zhang #endif
9250e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
926076ba34aSJunchao Zhang 
9279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
928076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
929076ba34aSJunchao Zhang   if (transA) {
9309566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
931076ba34aSJunchao Zhang     transA = false;
932076ba34aSJunchao Zhang   }
933076ba34aSJunchao Zhang 
934076ba34aSJunchao Zhang   if (transB) {
9359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
936076ba34aSJunchao Zhang     transB = false;
937076ba34aSJunchao Zhang   }
938076ba34aSJunchao Zhang 
9390e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
940076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
941076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
942076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
943076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
944076ba34aSJunchao Zhang   */
9450e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9460e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
947866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
948866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
949866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
950e944a159SJunchao Zhang #endif
9519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
952076ba34aSJunchao Zhang 
9539566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9549566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
955076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
957a3f881fbSStefano Zampini }
958a3f881fbSStefano Zampini 
959a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
960d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
961d71ae5a4SJacob Faibussowitsch {
962076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
963a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
964a3f881fbSStefano Zampini 
965a3f881fbSStefano Zampini   PetscFunctionBegin;
966a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9679566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
96848a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
969a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
970a3f881fbSStefano Zampini     switch (product->type) {
971a3f881fbSStefano Zampini     case MATPRODUCT_AB:
972a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
973d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
974d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
975d71ae5a4SJacob Faibussowitsch       break;
976a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
977a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
978d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
979d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
980d71ae5a4SJacob Faibussowitsch       break;
981d71ae5a4SJacob Faibussowitsch     default:
982d71ae5a4SJacob Faibussowitsch       break;
983a3f881fbSStefano Zampini     }
984a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
9859566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
986a3f881fbSStefano Zampini   }
9873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
988a3f881fbSStefano Zampini }
989a587d139SMark 
990d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
991d71ae5a4SJacob Faibussowitsch {
992f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
993f0cf5187SStefano Zampini 
994f0cf5187SStefano Zampini   PetscFunctionBegin;
9959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
9969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
997f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
998d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
9999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
10009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
10019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1003f0cf5187SStefano Zampini }
1004f0cf5187SStefano Zampini 
1005f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1006f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1007f4747e26SJunchao Zhang {
1008f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1009f4747e26SJunchao Zhang 
1010f4747e26SJunchao Zhang   PetscFunctionBegin;
1011f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1012f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1013f4747e26SJunchao Zhang 
1014f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1015f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1016f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1017f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1018f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1019d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1020f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1022f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1023f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1024f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1025f4747e26SJunchao Zhang   }
1026f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1027f4747e26SJunchao Zhang }
1028f4747e26SJunchao Zhang 
1029f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1030f4747e26SJunchao Zhang {
1031f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1032f4747e26SJunchao Zhang 
1033f4747e26SJunchao Zhang   PetscFunctionBegin;
1034f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1035f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1036f4747e26SJunchao Zhang     PetscInt                   n, nv;
1037f4747e26SJunchao Zhang 
1038f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1039f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1040f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1041f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1042f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1043f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1044f4747e26SJunchao Zhang 
1045f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1046f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1047f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1048f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1049d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1050f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1051f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1052f4747e26SJunchao Zhang       }));
1053f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1054f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1055f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1056f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1057f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1058f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1059f4747e26SJunchao Zhang   }
1060f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1061f4747e26SJunchao Zhang }
1062f4747e26SJunchao Zhang 
1063f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1064f4747e26SJunchao Zhang {
1065f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1066f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1067f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1068f4747e26SJunchao Zhang 
1069f4747e26SJunchao Zhang   PetscFunctionBegin;
1070f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1071f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1072f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1073f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1074f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1075f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1076f4747e26SJunchao Zhang   if (ll) {
1077f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1078f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1079f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1080f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1081d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1082f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1083f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1084f4747e26SJunchao Zhang         // scale entries on the row
1085f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1086f4747e26SJunchao Zhang       }));
1087f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1088f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1089f4747e26SJunchao Zhang   }
1090f4747e26SJunchao Zhang   if (rr) {
1091f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1092f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1093f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1094f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1095d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1096f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1097f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1098f4747e26SJunchao Zhang   }
1099f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1100f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1101f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1102f4747e26SJunchao Zhang }
1103f4747e26SJunchao Zhang 
1104d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1105d71ae5a4SJacob Faibussowitsch {
1106076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1107a587d139SMark 
1108a587d139SMark   PetscFunctionBegin;
1109076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
11102328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1111d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
11129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
11132328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
11149566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
11152328674fSJunchao Zhang   }
11163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1117a587d139SMark }
1118a587d139SMark 
1119d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1120d71ae5a4SJacob Faibussowitsch {
1121f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1122f78ce678SMark Adams   PetscInt              n;
1123f78ce678SMark Adams   PetscScalarKokkosView xv;
1124f78ce678SMark Adams 
1125f78ce678SMark Adams   PetscFunctionBegin;
1126f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1127f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1128f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1129f78ce678SMark Adams 
1130f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1131f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1132f78ce678SMark Adams 
1133f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1134f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1135f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1136f78ce678SMark Adams 
1137f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11389371c9d4SSatish Balay   Kokkos::parallel_for(
1139d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1140f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1141f78ce678SMark Adams       else xv(i) = 0;
1142f78ce678SMark Adams     });
1143f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1145f78ce678SMark Adams }
1146f78ce678SMark Adams 
1147db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1148d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1149d71ae5a4SJacob Faibussowitsch {
1150db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1151db78de30SJunchao Zhang 
1152db78de30SJunchao Zhang   PetscFunctionBegin;
1153db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11544f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1155db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1157db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1158076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1160db78de30SJunchao Zhang }
1161db78de30SJunchao Zhang 
1162d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1163d71ae5a4SJacob Faibussowitsch {
1164db78de30SJunchao Zhang   PetscFunctionBegin;
1165db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11664f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1167db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1169db78de30SJunchao Zhang }
1170db78de30SJunchao Zhang 
1171d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1172d71ae5a4SJacob Faibussowitsch {
1173db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1174db78de30SJunchao Zhang 
1175db78de30SJunchao Zhang   PetscFunctionBegin;
1176db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11774f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1178db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1180db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1181076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1183db78de30SJunchao Zhang }
1184db78de30SJunchao Zhang 
1185d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1186d71ae5a4SJacob Faibussowitsch {
1187db78de30SJunchao Zhang   PetscFunctionBegin;
1188db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11894f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1190db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1193db78de30SJunchao Zhang }
1194db78de30SJunchao Zhang 
1195d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1196d71ae5a4SJacob Faibussowitsch {
1197db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1198db78de30SJunchao Zhang 
1199db78de30SJunchao Zhang   PetscFunctionBegin;
1200db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12014f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1202db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1203db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1204076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1206db78de30SJunchao Zhang }
1207db78de30SJunchao Zhang 
1208d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1209d71ae5a4SJacob Faibussowitsch {
1210db78de30SJunchao Zhang   PetscFunctionBegin;
1211db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12124f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1213db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1216db78de30SJunchao Zhang }
1217db78de30SJunchao Zhang 
1218*c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1219*c0c276a7Ssdargavi {
1220*c0c276a7Ssdargavi   Mat_SeqAIJKokkos *akok;
1221*c0c276a7Ssdargavi 
1222*c0c276a7Ssdargavi   PetscFunctionBegin;
1223*c0c276a7Ssdargavi   auto exec = PetscGetKokkosExecutionSpace();
1224*c0c276a7Ssdargavi   // Create host copies of the input aij
1225*c0c276a7Ssdargavi   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1226*c0c276a7Ssdargavi   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1227*c0c276a7Ssdargavi   // Don't copy the vals to the host now
1228*c0c276a7Ssdargavi   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1229*c0c276a7Ssdargavi 
1230*c0c276a7Ssdargavi   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1231*c0c276a7Ssdargavi   // Note we have modified device data so it will copy lazily
1232*c0c276a7Ssdargavi   a_dual.modify_device();
1233*c0c276a7Ssdargavi   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1234*c0c276a7Ssdargavi   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1235*c0c276a7Ssdargavi 
1236*c0c276a7Ssdargavi   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1237*c0c276a7Ssdargavi   PetscCall(MatCreate(comm, A));
1238*c0c276a7Ssdargavi   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1239*c0c276a7Ssdargavi   PetscFunctionReturn(PETSC_SUCCESS);
1240*c0c276a7Ssdargavi }
1241*c0c276a7Ssdargavi 
1242c17cf699SJunchao Zhang /* Computes Y += alpha X */
1243d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1244d71ae5a4SJacob Faibussowitsch {
1245a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1246c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1247c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1248c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
12494df4a32cSJunchao Zhang   auto                     exec = PetscGetKokkosExecutionSpace();
1250a587d139SMark 
1251a587d139SMark   PetscFunctionBegin;
1252c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1253c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12549566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1257db78de30SJunchao Zhang 
1258c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1259a587d139SMark     PetscBool e;
12609566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1261a587d139SMark     if (e) {
12629566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1263c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1264a587d139SMark     }
1265a587d139SMark   }
1266db78de30SJunchao Zhang 
1267c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1268c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1269c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1270c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1271c17cf699SJunchao Zhang   */
1272c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1273c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1274c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1275c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1276c17cf699SJunchao Zhang 
1277c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1278d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12799566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1280c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1281c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1282c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1283c17cf699SJunchao Zhang 
12849371c9d4SSatish Balay     Kokkos::parallel_for(
1285d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
12860e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
12870e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
12880e3ece09SJunchao Zhang           // Only one thread works in a team
1289c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
12900e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
12910e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
12920e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1293c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1294c17cf699SJunchao Zhang               q++;
1295a587d139SMark             } else {
12960e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
12970e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
12980e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
12990e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
13008b8b16f9SJunchao Zhang #else
13010e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
13028b8b16f9SJunchao Zhang #endif
1303a587d139SMark             }
1304c17cf699SJunchao Zhang           }
1305c17cf699SJunchao Zhang         });
1306c17cf699SJunchao Zhang       });
13079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
13080e3ece09SJunchao Zhang   } else { // different nonzero patterns
1309c17cf699SJunchao Zhang     Mat             Z;
1310c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1311c17cf699SJunchao Zhang     KernelHandle    kh;
13120e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1313c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1314c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1315c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
13169566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
13179566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1318c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1319c17cf699SJunchao Zhang   }
13209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13210e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
13223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1323a587d139SMark }
1324a587d139SMark 
13252c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
13262c4ab24aSJunchao Zhang   PetscCount           n;
13272c4ab24aSJunchao Zhang   PetscCount           Atot;
13282c4ab24aSJunchao Zhang   PetscInt             nz;
13292c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
13302c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
13312c4ab24aSJunchao Zhang 
13322c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
13332c4ab24aSJunchao Zhang   {
13342c4ab24aSJunchao Zhang     nz   = coo_h->nz;
13352c4ab24aSJunchao Zhang     n    = coo_h->n;
13362c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
13372c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
13382c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
13392c4ab24aSJunchao Zhang   }
13402c4ab24aSJunchao Zhang };
13412c4ab24aSJunchao Zhang 
134249abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
13432c4ab24aSJunchao Zhang {
13442c4ab24aSJunchao Zhang   PetscFunctionBegin;
134549abdd8aSBarry Smith   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
13462c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13472c4ab24aSJunchao Zhang }
13482c4ab24aSJunchao Zhang 
1349d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1350d71ae5a4SJacob Faibussowitsch {
135142550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
135242550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
135303e76207SPierre Jolivet   PetscContainer             container_h;
13542c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13552c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
135642550becSJunchao Zhang 
135742550becSJunchao Zhang   PetscFunctionBegin;
13589566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1359394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
136042550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1361cbc6b225SStefano Zampini   delete akok;
1362f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13639566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13642c4ab24aSJunchao Zhang 
13652c4ab24aSJunchao Zhang   // Copy the COO struct to device
13662c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13672c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13682c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13692c4ab24aSJunchao Zhang 
13702c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
137103e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
13723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
137342550becSJunchao Zhang }
137442550becSJunchao Zhang 
1375d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1376d71ae5a4SJacob Faibussowitsch {
137742550becSJunchao Zhang   MatScalarKokkosView        Aa;
137842550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
137942550becSJunchao Zhang   PetscMemType               memtype;
13802c4ab24aSJunchao Zhang   PetscContainer             container;
13812c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
138242550becSJunchao Zhang 
138342550becSJunchao Zhang   PetscFunctionBegin;
13842c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13852c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
13862c4ab24aSJunchao Zhang 
13872c4ab24aSJunchao Zhang   const auto &n    = coo->n;
13882c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
13892c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
13902c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
13912c4ab24aSJunchao Zhang 
13929566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
139342550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
13942c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
139542550becSJunchao Zhang   } else {
13962c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
139742550becSJunchao Zhang   }
139842550becSJunchao Zhang 
1399c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1400c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
140142550becSJunchao Zhang 
140208bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
14039371c9d4SSatish Balay   Kokkos::parallel_for(
1404d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1405c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1406c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1407c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1408c7b718f4SJunchao Zhang     });
140908bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1410394ed5ebSJunchao Zhang 
14119566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
14129566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
14133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
141442550becSJunchao Zhang }
141542550becSJunchao Zhang 
1416d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1417d71ae5a4SJacob Faibussowitsch {
14188f7e8f9dSMark Adams   PetscFunctionBegin;
14199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
14209566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
14218f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
14223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14238f7e8f9dSMark Adams }
14248f7e8f9dSMark Adams 
1425d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1426d71ae5a4SJacob Faibussowitsch {
1427076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1428076ba34aSJunchao Zhang 
14298c3ff71bSJunchao Zhang   PetscFunctionBegin;
1430076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
14316f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
14326f3d89d0SStefano Zampini 
14338c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
14348c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
14358c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1436a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1437f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1438a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1439076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
14408c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
14418c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14428c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14438c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14448c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14458c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1446076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14470ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1448152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1449f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1450f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1451f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1452f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1453076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1454076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1455076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1456076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1457076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1458076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14597ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
146042550becSJunchao Zhang 
14619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14629566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
146357761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
146457761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
146557761e9aSJunchao Zhang #endif
14663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1467076ba34aSJunchao Zhang }
1468076ba34aSJunchao Zhang 
14699d13fa56SJunchao Zhang /*
14709d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14719d13fa56SJunchao Zhang 
14729d13fa56SJunchao Zhang   Input Parameters:
14739d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14749d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14759d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14769d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14779d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14789d13fa56SJunchao Zhang 
14799d13fa56SJunchao Zhang   Output Parameter:
14809d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14819d13fa56SJunchao Zhang */
14829d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14839d13fa56SJunchao Zhang {
14849d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14859d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14869d13fa56SJunchao Zhang 
14879d13fa56SJunchao Zhang   PetscFunctionBegin;
14889d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14899d13fa56SJunchao Zhang 
14909d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14919d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14929d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14939d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
14949d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
14959d13fa56SJunchao Zhang   // TODO: how to tune the team size?
149645402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
14979d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
14989d13fa56SJunchao Zhang #else
14999d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
15009d13fa56SJunchao Zhang #endif
15019d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1502d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
15039d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
15049d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
15059d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
15069d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
15079d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
15089d13fa56SJunchao Zhang 
15099d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
15109d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
15119d13fa56SJunchao Zhang 
15129d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
15139d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
15149d13fa56SJunchao Zhang 
15159d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
15169d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
15179d13fa56SJunchao Zhang               B(r, c) = 0.0;
15189d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
15199d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
15209d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
15219d13fa56SJunchao Zhang               B(r, c) = 0.0;
15229d13fa56SJunchao Zhang             }
15239d13fa56SJunchao Zhang           }
15249d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
15259d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
15269d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
15279d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
15289d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
15299d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
15309d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
15319d13fa56SJunchao Zhang           }
15329d13fa56SJunchao Zhang         }
15339d13fa56SJunchao Zhang       });
15349d13fa56SJunchao Zhang 
15359d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
15369d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
15379d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
15389d13fa56SJunchao Zhang     }));
15399d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
15409d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15419d13fa56SJunchao Zhang }
15429d13fa56SJunchao Zhang 
1543d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1544d71ae5a4SJacob Faibussowitsch {
1545076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1546076ba34aSJunchao Zhang   PetscInt    i, m, n;
15474df4a32cSJunchao Zhang   auto        exec = PetscGetKokkosExecutionSpace();
1548076ba34aSJunchao Zhang 
1549076ba34aSJunchao Zhang   PetscFunctionBegin;
15505f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1551076ba34aSJunchao Zhang 
1552076ba34aSJunchao Zhang   m = akok->nrows();
1553076ba34aSJunchao Zhang   n = akok->ncols();
15549566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15559566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1556076ba34aSJunchao Zhang 
1557076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
155957508eceSPierre Jolivet   aseq = (Mat_SeqAIJ *)A->data;
1560076ba34aSJunchao Zhang 
1561e36ced11SJunchao Zhang   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1562e36ced11SJunchao Zhang   PetscCallCXX(akok->j_dual.sync_host(exec));
1563e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1564076ba34aSJunchao Zhang 
1565076ba34aSJunchao Zhang   aseq->i       = akok->i_host_data();
1566076ba34aSJunchao Zhang   aseq->j       = akok->j_host_data();
1567076ba34aSJunchao Zhang   aseq->a       = akok->a_host_data();
1568076ba34aSJunchao Zhang   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1569076ba34aSJunchao Zhang   aseq->free_a  = PETSC_FALSE;
1570076ba34aSJunchao Zhang   aseq->free_ij = PETSC_FALSE;
1571076ba34aSJunchao Zhang   aseq->nz      = akok->nnz();
1572076ba34aSJunchao Zhang   aseq->maxnz   = aseq->nz;
1573076ba34aSJunchao Zhang 
15749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1576ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1577076ba34aSJunchao Zhang 
1578076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1579076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1580ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15819566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15829566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1584076ba34aSJunchao Zhang }
1585076ba34aSJunchao Zhang 
15860e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15870e3ece09SJunchao Zhang {
15880e3ece09SJunchao Zhang   PetscFunctionBegin;
15890e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15900e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15910e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15920e3ece09SJunchao Zhang }
15930e3ece09SJunchao Zhang 
15940e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
15950e3ece09SJunchao Zhang {
15960e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
15974d86920dSPierre Jolivet 
15980e3ece09SJunchao Zhang   PetscFunctionBegin;
15990e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
16000e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
16010e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16020e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
16030e3ece09SJunchao Zhang }
16040e3ece09SJunchao Zhang 
1605076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1606076ba34aSJunchao Zhang 
1607076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1608076ba34aSJunchao Zhang  */
1609d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1610d71ae5a4SJacob Faibussowitsch {
1611076ba34aSJunchao Zhang   PetscFunctionBegin;
16129566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16139566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16158c3ff71bSJunchao Zhang }
16168c3ff71bSJunchao Zhang 
1617152b3e56SJunchao Zhang /*@C
161811a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
16198c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
162020f4b53cSBarry Smith   Kokkos for calculations.
16218c3ff71bSJunchao Zhang 
16228c3ff71bSJunchao Zhang   Collective
16238c3ff71bSJunchao Zhang 
16248c3ff71bSJunchao Zhang   Input Parameters:
162511a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
16268c3ff71bSJunchao Zhang . m    - number of rows
16278c3ff71bSJunchao Zhang . n    - number of columns
162820f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
162920f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
16308c3ff71bSJunchao Zhang 
16318c3ff71bSJunchao Zhang   Output Parameter:
16328c3ff71bSJunchao Zhang . A - the matrix
16338c3ff71bSJunchao Zhang 
16342ef1f0ffSBarry Smith   Level: intermediate
16352ef1f0ffSBarry Smith 
16362ef1f0ffSBarry Smith   Notes:
163711a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
16388c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
163911a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
16408c3ff71bSJunchao Zhang 
164111a5261eSBarry Smith   The AIJ format, also called
16422ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
16438c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
164420f4b53cSBarry Smith   either one (as in Fortran) or zero.
16458c3ff71bSJunchao Zhang 
16462ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16472ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16482ef1f0ffSBarry Smith   allocation.
16498c3ff71bSJunchao Zhang 
1650fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16518c3ff71bSJunchao Zhang @*/
1652d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1653d71ae5a4SJacob Faibussowitsch {
16548c3ff71bSJunchao Zhang   PetscFunctionBegin;
16559566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16569566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16579566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16589566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16618c3ff71bSJunchao Zhang }
1662930e68a5SMark Adams 
1663d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1664d71ae5a4SJacob Faibussowitsch {
1665930e68a5SMark Adams   PetscFunctionBegin;
16669566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
166786a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
16683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
166986a27549SJunchao Zhang }
167086a27549SJunchao Zhang 
1671d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1672d71ae5a4SJacob Faibussowitsch {
167386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
167486a27549SJunchao Zhang 
167586a27549SJunchao Zhang   PetscFunctionBegin;
167686a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
167786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
167886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
167986a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
168086a27549SJunchao Zhang   }
16813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
168286a27549SJunchao Zhang }
168386a27549SJunchao Zhang 
168486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1685d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1686d71ae5a4SJacob Faibussowitsch {
168786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1688076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
168986a27549SJunchao Zhang 
169086a27549SJunchao Zhang   PetscFunctionBegin;
169186a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
169286a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
16937b8d4ba6SJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
16947b8d4ba6SJunchao Zhang     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
16957b8d4ba6SJunchao Zhang     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
169686a27549SJunchao Zhang 
16979371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
169886a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
169986a27549SJunchao Zhang 
170086a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
170186a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
170286a27549SJunchao Zhang     */
17039371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
170486a27549SJunchao Zhang 
170586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
170686a27549SJunchao Zhang 
170786a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
17087b8d4ba6SJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
17097b8d4ba6SJunchao Zhang     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
17107b8d4ba6SJunchao Zhang     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
171186a27549SJunchao Zhang 
17129371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
171386a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
171486a27549SJunchao Zhang 
171586a27549SJunchao Zhang     /* Sort indices. See comments above */
17169371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
171786a27549SJunchao Zhang 
171886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
171986a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
172086a27549SJunchao Zhang   }
17213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
172286a27549SJunchao Zhang }
172386a27549SJunchao Zhang 
172486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1725d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1726d71ae5a4SJacob Faibussowitsch {
172786a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
172886a27549SJunchao Zhang   PetscScalarKokkosView       xv;
172986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
173086a27549SJunchao Zhang 
173186a27549SJunchao Zhang   PetscFunctionBegin;
17329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
17349566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17359566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
173686a27549SJunchao Zhang   /* Solve L tmpv = b */
17379566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
173886a27549SJunchao Zhang   /* Solve Ux = tmpv */
17399566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
17409566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17419566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
174486a27549SJunchao Zhang }
174586a27549SJunchao Zhang 
1746076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
1747d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1748d71ae5a4SJacob Faibussowitsch {
174986a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
175086a27549SJunchao Zhang   PetscScalarKokkosView       xv;
175186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
175286a27549SJunchao Zhang 
175386a27549SJunchao Zhang   PetscFunctionBegin;
17549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
17569566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17579566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
175886a27549SJunchao Zhang   /* Solve U^T tmpv = b */
175986a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
176086a27549SJunchao Zhang 
176186a27549SJunchao Zhang   /* Solve L^T x = tmpv */
176286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
17639566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17649566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
176786a27549SJunchao Zhang }
176886a27549SJunchao Zhang 
1769d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1770d71ae5a4SJacob Faibussowitsch {
177186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
177286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
177386a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
177486a27549SJunchao Zhang 
177586a27549SJunchao Zhang   PetscFunctionBegin;
17769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1778076ba34aSJunchao Zhang 
1779076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1780076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1781076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1782076ba34aSJunchao Zhang 
1783076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
178486a27549SJunchao Zhang 
178586a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
178686a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
178786a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
178886a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
178986a27549SJunchao Zhang   B->ops->matsolve          = NULL;
179086a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
179186a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
179286a27549SJunchao Zhang 
179386a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
179486a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
179586a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1796eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
17979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
179986a27549SJunchao Zhang }
180086a27549SJunchao Zhang 
1801d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1802d71ae5a4SJacob Faibussowitsch {
180386a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
180486a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
180586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
180686a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
180786a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
180886a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
180986a27549SJunchao Zhang 
181086a27549SJunchao Zhang   PetscFunctionBegin;
18119566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
181286a27549SJunchao Zhang   /* Rebuild factors */
18139371c9d4SSatish Balay   if (factors) {
18149371c9d4SSatish Balay     factors->Destroy();
18159371c9d4SSatish Balay   } /* Destroy the old if it exists */
18169371c9d4SSatish Balay   else {
18179371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
18189371c9d4SSatish Balay   }
181986a27549SJunchao Zhang 
182086a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
182186a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
182286a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
182386a27549SJunchao Zhang 
182486a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
182586a27549SJunchao Zhang 
182686a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
182786a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
182886a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
182986a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
183086a27549SJunchao Zhang 
183186a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1832076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1833076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1834076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
183586a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
183686a27549SJunchao Zhang 
183786a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
183886a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
183986a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
184086a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
184186a27549SJunchao Zhang 
184286a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
184386a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
184486a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
184586a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
184686a27549SJunchao Zhang #else
184786a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
184886a27549SJunchao Zhang #endif
184986a27549SJunchao Zhang 
185086a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
185186a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
185286a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
185386a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
185486a27549SJunchao Zhang 
185586a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
18569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
185786a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
185886a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
185986a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1860a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
186186a27549SJunchao Zhang 
186286a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
186386a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
18643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1865930e68a5SMark Adams }
1866930e68a5SMark Adams 
1867d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1868d71ae5a4SJacob Faibussowitsch {
1869930e68a5SMark Adams   PetscFunctionBegin;
1870930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
18713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1872930e68a5SMark Adams }
1873930e68a5SMark Adams 
1874930e68a5SMark Adams /*MC
187586a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
187611a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1877930e68a5SMark Adams 
1878930e68a5SMark Adams   Level: beginner
1879930e68a5SMark Adams 
18801cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1881930e68a5SMark Adams M*/
188286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1883930e68a5SMark Adams {
1884930e68a5SMark Adams   PetscInt n = A->rmap->n;
1885930e68a5SMark Adams 
1886930e68a5SMark Adams   PetscFunctionBegin;
18879566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
18889566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1889930e68a5SMark Adams   (*B)->factortype = ftype;
18909566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
18919566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1892930e68a5SMark Adams 
18938f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18949566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
189586a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
189686a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
189786a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
18989566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
189986a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
190086a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
190198921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1902930e68a5SMark Adams 
19039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1904f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
19053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1906930e68a5SMark Adams }
19078f7e8f9dSMark Adams 
1908d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1909d71ae5a4SJacob Faibussowitsch {
191086a27549SJunchao Zhang   PetscFunctionBegin;
19119566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
19129566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
19133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
191486a27549SJunchao Zhang }
191586a27549SJunchao Zhang 
1916076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1917d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1918d71ae5a4SJacob Faibussowitsch {
191945402d8aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
192045402d8aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
192145402d8aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1922076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1923076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1924076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1925076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1926076ba34aSJunchao Zhang 
1927076ba34aSJunchao Zhang   PetscFunctionBegin;
19289566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1929076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
19309566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
193148a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
19329566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1933076ba34aSJunchao Zhang   }
19343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1935076ba34aSJunchao Zhang }
1936