xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision d326c3f1c5d950e31249cdd49d2b04d6dbe295f4)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3076ba34aSJunchao Zhang #include <petscpkg_version.h>
4152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
542550becSJunchao Zhang #include <petsc/private/sfimpl.h>
68c3ff71bSJunchao Zhang #include <petscsystypes.h>
78c3ff71bSJunchao Zhang #include <petscerror.h>
88c3ff71bSJunchao Zhang 
98c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
10f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
179d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
189d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
1986a27549SJunchao Zhang 
2042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
218c3ff71bSJunchao Zhang 
220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
24f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
259371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
26f98996d3SJunchao Zhang #else
27f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
28f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
299371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
30f98996d3SJunchao Zhang #endif
31f98996d3SJunchao Zhang 
328c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
338c3ff71bSJunchao Zhang 
34076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
37076ba34aSJunchao Zhang  */
38d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39d71ae5a4SJacob Faibussowitsch {
40076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
41076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
428c3ff71bSJunchao Zhang 
438c3ff71bSJunchao Zhang   PetscFunctionBegin;
443ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
459566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46076ba34aSJunchao Zhang 
47076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49076ba34aSJunchao Zhang 
50076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
51076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
53076ba34aSJunchao Zhang   */
54076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55076ba34aSJunchao Zhang     delete aijkok;
56f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57076ba34aSJunchao Zhang     A->spptr = aijkok;
58f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62076ba34aSJunchao Zhang   }
633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
648c3ff71bSJunchao Zhang }
658c3ff71bSJunchao Zhang 
6686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
67d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68d71ae5a4SJacob Faibussowitsch {
698c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
708c3ff71bSJunchao Zhang 
718c3ff71bSJunchao Zhang   PetscFunctionBegin;
72aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
735f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
75076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
76580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
7786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
788c3ff71bSJunchao Zhang   }
793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
808c3ff71bSJunchao Zhang }
818c3ff71bSJunchao Zhang 
82076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
83d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84d71ae5a4SJacob Faibussowitsch {
8586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
8686a27549SJunchao Zhang 
8786a27549SJunchao Zhang   PetscFunctionBegin;
885f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
8986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
9086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
9186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
9286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
949566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9686a27549SJunchao Zhang }
9786a27549SJunchao Zhang 
98d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99d71ae5a4SJacob Faibussowitsch {
100f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101e36ced11SJunchao Zhang   auto             &exec   = PetscGetKokkosExecutionSpace();
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini   PetscFunctionBegin;
104f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10586a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
106aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1075f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108e36ced11SJunchao Zhang   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
111f0cf5187SStefano Zampini }
112f0cf5187SStefano Zampini 
113d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114d71ae5a4SJacob Faibussowitsch {
115076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116f0cf5187SStefano Zampini 
117f0cf5187SStefano Zampini   PetscFunctionBegin;
1185519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1195519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1205519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1215519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1225519a089SJose E. Roman   */
1235519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
125e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
127076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
128076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
129076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130076ba34aSJunchao Zhang   }
1313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
132076ba34aSJunchao Zhang }
133076ba34aSJunchao Zhang 
134d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135d71ae5a4SJacob Faibussowitsch {
136076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137076ba34aSJunchao Zhang 
138076ba34aSJunchao Zhang   PetscFunctionBegin;
1395519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144d71ae5a4SJacob Faibussowitsch {
145076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang   PetscFunctionBegin;
1485519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
150e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else {
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1552328674fSJunchao Zhang   }
1563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160d71ae5a4SJacob Faibussowitsch {
161076ba34aSJunchao Zhang   PetscFunctionBegin;
162076ba34aSJunchao Zhang   *array = NULL;
1633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
164076ba34aSJunchao Zhang }
165076ba34aSJunchao Zhang 
166d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167d71ae5a4SJacob Faibussowitsch {
168076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169076ba34aSJunchao Zhang 
170076ba34aSJunchao Zhang   PetscFunctionBegin;
1715519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1732328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1742328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1752328674fSJunchao Zhang   }
1763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
177076ba34aSJunchao Zhang }
178076ba34aSJunchao Zhang 
179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180d71ae5a4SJacob Faibussowitsch {
181076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182076ba34aSJunchao Zhang 
183076ba34aSJunchao Zhang   PetscFunctionBegin;
1845519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
186076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1872328674fSJunchao Zhang   }
1883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
189f0cf5187SStefano Zampini }
190f0cf5187SStefano Zampini 
191d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192d71ae5a4SJacob Faibussowitsch {
1937ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1947ee59b9bSJunchao Zhang 
1957ee59b9bSJunchao Zhang   PetscFunctionBegin;
1967ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
1977ee59b9bSJunchao Zhang 
1987ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1997ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2007ee59b9bSJunchao Zhang   if (a) {
2017ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
2027ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2037ee59b9bSJunchao Zhang   }
2047ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2067ee59b9bSJunchao Zhang }
2077ee59b9bSJunchao Zhang 
2080e3ece09SJunchao Zhang /*
2090e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2100e3ece09SJunchao Zhang 
2110e3ece09SJunchao Zhang   Input Parameter:
2120e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2130e3ece09SJunchao Zhang 
2140e3ece09SJunchao Zhang   Output Parameters:
2150e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2170e3ece09SJunchao Zhang */
2180e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219d71ae5a4SJacob Faibussowitsch {
2200e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2210e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2220e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2237b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2240e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2257b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2267b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2270e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2280e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2290e3ece09SJunchao Zhang   PetscInt               *offset;
230152b3e56SJunchao Zhang 
231152b3e56SJunchao Zhang   PetscFunctionBegin;
2320e3ece09SJunchao Zhang   // Populate Ti
2330e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2340e3ece09SJunchao Zhang   Ti++;
2350e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2360e3ece09SJunchao Zhang   Ti--;
2370e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2380e3ece09SJunchao Zhang 
2390e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2400e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2410e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2420e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2430e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2440e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2450e3ece09SJunchao Zhang 
2460e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2470e3ece09SJunchao Zhang       perm[disp] = j;
2480e3ece09SJunchao Zhang       offset[r]++;
249076ba34aSJunchao Zhang     }
2500e3ece09SJunchao Zhang   }
2510e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2520e3ece09SJunchao Zhang 
2530e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2540e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2550e3ece09SJunchao Zhang 
2560e3ece09SJunchao Zhang   // Output perm and T on device
2570e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2580e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2590e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2600e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
262152b3e56SJunchao Zhang }
263152b3e56SJunchao Zhang 
2640e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
2650e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
2660e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267d71ae5a4SJacob Faibussowitsch {
2680e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2690e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2700e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2710e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
272152b3e56SJunchao Zhang 
273152b3e56SJunchao Zhang   PetscFunctionBegin;
2740e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275145b44c9SPierre Jolivet   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
2760e3ece09SJunchao Zhang 
2770e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
2780e3ece09SJunchao Zhang 
2790e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
2800e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
2810e3ece09SJunchao Zhang   } else {
2820e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
2830e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
2840e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
2850e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
2860e3ece09SJunchao Zhang         auto       &Ta   = T.values;
2870e3ece09SJunchao Zhang 
288*d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289076ba34aSJunchao Zhang       }
2900e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
2910e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
2920e3ece09SJunchao Zhang 
2930e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
2940e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295*d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
2960e3ece09SJunchao Zhang     }
2970e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
2980e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
2990e3ece09SJunchao Zhang   }
3000e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3010e3ece09SJunchao Zhang }
3020e3ece09SJunchao Zhang 
3030e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3040e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3050e3ece09SJunchao Zhang {
3060e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3070e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3080e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3090e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3100e3ece09SJunchao Zhang 
3110e3ece09SJunchao Zhang   PetscFunctionBegin;
3120e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
3130e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
3140e3ece09SJunchao Zhang 
3150e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3160e3ece09SJunchao Zhang 
3170e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3180e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3190e3ece09SJunchao Zhang   } else {
3200e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3210e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3220e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3230e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3240e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3250e3ece09SJunchao Zhang 
326*d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3270e3ece09SJunchao Zhang       }
3280e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3290e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3300e3ece09SJunchao Zhang 
3310e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3320e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333*d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3340e3ece09SJunchao Zhang     }
3350e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3360e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3370e3ece09SJunchao Zhang   }
3383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
339152b3e56SJunchao Zhang }
340a587d139SMark 
3418c3ff71bSJunchao Zhang /* y = A x */
342d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343d71ae5a4SJacob Faibussowitsch {
3448c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
345152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
346152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3478c3ff71bSJunchao Zhang 
3488c3ff71bSJunchao Zhang   PetscFunctionBegin;
3499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3519566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3529566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3538c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3559566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3569566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3618c3ff71bSJunchao Zhang }
3628c3ff71bSJunchao Zhang 
3638c3ff71bSJunchao Zhang /* y = A^T x */
364d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365d71ae5a4SJacob Faibussowitsch {
3668c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
367152b3e56SJunchao Zhang   const char                *mode;
368152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
369152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3700e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3718c3ff71bSJunchao Zhang 
3728c3ff71bSJunchao Zhang   PetscFunctionBegin;
3739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3759566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3769566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3789566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379152b3e56SJunchao Zhang     mode = "N";
380152b3e56SJunchao Zhang   } else {
381076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3820e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
383152b3e56SJunchao Zhang     mode   = "T";
384152b3e56SJunchao Zhang   }
385*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
3869566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3879566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3880e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
3899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3918c3ff71bSJunchao Zhang }
3928c3ff71bSJunchao Zhang 
3938c3ff71bSJunchao Zhang /* y = A^H x */
394d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395d71ae5a4SJacob Faibussowitsch {
3968c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
397152b3e56SJunchao Zhang   const char                *mode;
398152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
399152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4000e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang   PetscFunctionBegin;
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4059566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4069566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409152b3e56SJunchao Zhang     mode = "N";
410152b3e56SJunchao Zhang   } else {
411076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4120e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
413152b3e56SJunchao Zhang     mode   = "C";
414152b3e56SJunchao Zhang   }
415*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4169566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4179566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4180e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4218c3ff71bSJunchao Zhang }
4228c3ff71bSJunchao Zhang 
4238c3ff71bSJunchao Zhang /* z = A x + y */
424d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425d71ae5a4SJacob Faibussowitsch {
4268c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
427152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
428152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4298c3ff71bSJunchao Zhang 
4308c3ff71bSJunchao Zhang   PetscFunctionBegin;
4319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4339566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4349566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4359566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4368c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
4378c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
438*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4399566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4409566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4419566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
4429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4439566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4458c3ff71bSJunchao Zhang }
4468c3ff71bSJunchao Zhang 
4478c3ff71bSJunchao Zhang /* z = A^T x + y */
448d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
449d71ae5a4SJacob Faibussowitsch {
4508c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
451152b3e56SJunchao Zhang   const char                *mode;
452152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
453152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4540e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4558c3ff71bSJunchao Zhang 
4568c3ff71bSJunchao Zhang   PetscFunctionBegin;
4579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4599566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4609566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4619566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4628c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
463152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4649566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
465152b3e56SJunchao Zhang     mode = "N";
466152b3e56SJunchao Zhang   } else {
467076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4680e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
469152b3e56SJunchao Zhang     mode   = "T";
470152b3e56SJunchao Zhang   }
471*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
4729566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4739566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4749566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
4750e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4788c3ff71bSJunchao Zhang }
4798c3ff71bSJunchao Zhang 
4808c3ff71bSJunchao Zhang /* z = A^H x + y */
481d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
482d71ae5a4SJacob Faibussowitsch {
4838c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
484152b3e56SJunchao Zhang   const char                *mode;
485152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
486152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4870e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4888c3ff71bSJunchao Zhang 
4898c3ff71bSJunchao Zhang   PetscFunctionBegin;
4909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4929566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4939566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4949566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4958c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
496152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4979566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
498152b3e56SJunchao Zhang     mode = "N";
499152b3e56SJunchao Zhang   } else {
500076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5010e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
502152b3e56SJunchao Zhang     mode   = "C";
503152b3e56SJunchao Zhang   }
504*d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5059566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
5069566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
5079566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
5080e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
511152b3e56SJunchao Zhang }
512152b3e56SJunchao Zhang 
51366976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
514d71ae5a4SJacob Faibussowitsch {
515152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
516152b3e56SJunchao Zhang 
517152b3e56SJunchao Zhang   PetscFunctionBegin;
518152b3e56SJunchao Zhang   switch (op) {
519152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
520152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5219566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
522152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
523152b3e56SJunchao Zhang     break;
524d71ae5a4SJacob Faibussowitsch   default:
525d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
526d71ae5a4SJacob Faibussowitsch     break;
527152b3e56SJunchao Zhang   }
5283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5298c3ff71bSJunchao Zhang }
5308c3ff71bSJunchao Zhang 
531076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
532d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
533d71ae5a4SJacob Faibussowitsch {
534076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5358c3ff71bSJunchao Zhang 
5368c3ff71bSJunchao Zhang   PetscFunctionBegin;
5379566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
538076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
5399566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
5408c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5419566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
542076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5435f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5449566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5459566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5469566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5479566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
548076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
549394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5505f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
551f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5528c3ff71bSJunchao Zhang     }
553076ba34aSJunchao Zhang   }
5543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5558c3ff71bSJunchao Zhang }
5568c3ff71bSJunchao Zhang 
557076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
558076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
559076ba34aSJunchao Zhang  */
560d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
561d71ae5a4SJacob Faibussowitsch {
562076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
563076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
564076ba34aSJunchao Zhang   Mat               mat;
5658c3ff71bSJunchao Zhang 
5668c3ff71bSJunchao Zhang   PetscFunctionBegin;
567076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5689566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
569076ba34aSJunchao Zhang   mat = *B;
570f4747e26SJunchao Zhang   if (A->assembled) {
571076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
572f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
573076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
574076ba34aSJunchao Zhang     /* Now copy values to B if needed */
575076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
576076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
577076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
578076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
579076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
580076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
581076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
582076ba34aSJunchao Zhang       }
583076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
584076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
585076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
586076ba34aSJunchao Zhang     }
587076ba34aSJunchao Zhang     mat->spptr = bkok;
588076ba34aSJunchao Zhang   }
589076ba34aSJunchao Zhang 
5909566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5919566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
5929566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
5939566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5958c3ff71bSJunchao Zhang }
5968c3ff71bSJunchao Zhang 
597d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
598d71ae5a4SJacob Faibussowitsch {
5990ecb592aSJunchao Zhang   Mat               At;
6000e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
6010ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
6020ecb592aSJunchao Zhang 
6030ecb592aSJunchao Zhang   PetscFunctionBegin;
6047fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
6059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6060ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
607ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6080e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6099566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6100ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6119566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6120ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6130ecb592aSJunchao Zhang     if ((*B)->assembled) {
6140ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6150e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6169566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6170ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6180ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6190e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6200e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6210e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6220e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6230ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6240ecb592aSJunchao Zhang   }
6253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6260ecb592aSJunchao Zhang }
6270ecb592aSJunchao Zhang 
628d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
629d71ae5a4SJacob Faibussowitsch {
63086a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6318c3ff71bSJunchao Zhang 
6328c3ff71bSJunchao Zhang   PetscFunctionBegin;
63386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
63486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6358c3ff71bSJunchao Zhang     delete aijkok;
63686a27549SJunchao Zhang   } else {
63786a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
63886a27549SJunchao Zhang   }
639cbc6b225SStefano Zampini   A->spptr = NULL;
6409566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6419566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6429566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
6439566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6458c3ff71bSJunchao Zhang }
6468c3ff71bSJunchao Zhang 
6473f3ba80aSJunchao Zhang /*MC
6483f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6493f3ba80aSJunchao Zhang 
65015229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6513f3ba80aSJunchao Zhang 
6522ef1f0ffSBarry Smith    Options Database Key:
65311a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6543f3ba80aSJunchao Zhang 
6553f3ba80aSJunchao Zhang   Level: beginner
6563f3ba80aSJunchao Zhang 
6571cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6583f3ba80aSJunchao Zhang M*/
659d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
660d71ae5a4SJacob Faibussowitsch {
66186a27549SJunchao Zhang   PetscFunctionBegin;
6629566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6639566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6649566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
66686a27549SJunchao Zhang }
66786a27549SJunchao Zhang 
668076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
669d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
670d71ae5a4SJacob Faibussowitsch {
671076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
672076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
673076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
674076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
675076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
676076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
677a3f881fbSStefano Zampini 
678a3f881fbSStefano Zampini   PetscFunctionBegin;
679076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
680076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
6814f572ea9SToby Isaac   PetscAssertPointer(C, 4);
682076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
683076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
6845f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
6855f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
686076ba34aSJunchao Zhang 
6879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
689076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
690076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
691076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
692076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
693076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
694076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
695076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
696076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
697076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
698076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
699076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
700076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
701076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
702076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
703076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
704076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
705076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
706076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
707076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
708076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
709076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
710076ba34aSJunchao Zhang 
711076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7129371c9d4SSatish Balay     Kokkos::parallel_for(
713*d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
715076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
716076ba34aSJunchao Zhang 
717076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
718076ba34aSJunchao Zhang                                                    ci(i) = coffset;
719076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
720076ba34aSJunchao Zhang         });
721076ba34aSJunchao Zhang 
722076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
723076ba34aSJunchao Zhang           if (k < alen) {
724076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
725076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
726076ba34aSJunchao Zhang           } else {
727076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
728076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
729076ba34aSJunchao Zhang           }
730076ba34aSJunchao Zhang         });
731076ba34aSJunchao Zhang       });
732076ba34aSJunchao Zhang     ca_dual.modify_device();
733076ba34aSJunchao Zhang     ci_dual.modify_device();
734076ba34aSJunchao Zhang     cj_dual.modify_device();
7359566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
7369566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
737076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
738076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
739076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
740076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
741076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
742076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
743076ba34aSJunchao Zhang 
7449371c9d4SSatish Balay     Kokkos::parallel_for(
745*d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
746076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
747076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
748076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
749076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
750076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
751076ba34aSJunchao Zhang         });
752076ba34aSJunchao Zhang       });
7539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
754076ba34aSJunchao Zhang   }
7553ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
756076ba34aSJunchao Zhang }
757076ba34aSJunchao Zhang 
758d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
759d71ae5a4SJacob Faibussowitsch {
760076ba34aSJunchao Zhang   PetscFunctionBegin;
761076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
763a3f881fbSStefano Zampini }
764a3f881fbSStefano Zampini 
765d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
766d71ae5a4SJacob Faibussowitsch {
767a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
768a3f881fbSStefano Zampini   Mat                          A, B;
769076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
770a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
771a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
772076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
7730e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
774a3f881fbSStefano Zampini 
775a3f881fbSStefano Zampini   PetscFunctionBegin;
776a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7775f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
778076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
779076ba34aSJunchao Zhang 
7800e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
7810e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
7820e3ece09SJunchao Zhang   // we still do numeric.
7830e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
7840e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
7853ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
786076ba34aSJunchao Zhang   }
787076ba34aSJunchao Zhang 
788076ba34aSJunchao Zhang   switch (product->type) {
7899371c9d4SSatish Balay   case MATPRODUCT_AB:
7909371c9d4SSatish Balay     transA = false;
7919371c9d4SSatish Balay     transB = false;
7929371c9d4SSatish Balay     break;
7939371c9d4SSatish Balay   case MATPRODUCT_AtB:
7949371c9d4SSatish Balay     transA = true;
7959371c9d4SSatish Balay     transB = false;
7969371c9d4SSatish Balay     break;
7979371c9d4SSatish Balay   case MATPRODUCT_ABt:
7989371c9d4SSatish Balay     transA = false;
7999371c9d4SSatish Balay     transB = true;
8009371c9d4SSatish Balay     break;
801d71ae5a4SJacob Faibussowitsch   default:
802d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
803076ba34aSJunchao Zhang   }
804076ba34aSJunchao Zhang 
805a3f881fbSStefano Zampini   A = product->A;
806a3f881fbSStefano Zampini   B = product->B;
8079566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
809a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
810a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
811a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
812076ba34aSJunchao Zhang 
8135f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
814076ba34aSJunchao Zhang 
8150e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8160e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
817076ba34aSJunchao Zhang 
818076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
819076ba34aSJunchao Zhang   if (transA) {
8209566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
821076ba34aSJunchao Zhang     transA = false;
822a3f881fbSStefano Zampini   }
823a3f881fbSStefano Zampini 
824076ba34aSJunchao Zhang   if (transB) {
8259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
826076ba34aSJunchao Zhang     transB = false;
827076ba34aSJunchao Zhang   }
8289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8290e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8300e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
831866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
832866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
833e944a159SJunchao Zhang #endif
834866eb059SJunchao Zhang 
8359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
837a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
838a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8399566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8409566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8419566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
842a3f881fbSStefano Zampini   c->reallocs         = 0;
843076ba34aSJunchao Zhang   C->info.mallocs     = 0;
844a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
845a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
846a3f881fbSStefano Zampini   C->num_ass++;
8473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
848a3f881fbSStefano Zampini }
849a3f881fbSStefano Zampini 
850d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
851d71ae5a4SJacob Faibussowitsch {
852076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
853076ba34aSJunchao Zhang   MatProductType               ptype;
854076ba34aSJunchao Zhang   Mat                          A, B;
855076ba34aSJunchao Zhang   bool                         transA, transB;
856076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
857076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
858076ba34aSJunchao Zhang   MPI_Comm                     comm;
8590e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
860a3f881fbSStefano Zampini 
861a3f881fbSStefano Zampini   PetscFunctionBegin;
862a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8639566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8645f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
865a3f881fbSStefano Zampini   A = product->A;
866a3f881fbSStefano Zampini   B = product->B;
8679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
869a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
870a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8710e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8720e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
873076ba34aSJunchao Zhang 
874a3f881fbSStefano Zampini   ptype = product->type;
8750e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
8760e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
8770e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8780e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
8790e3ece09SJunchao Zhang   }
8800e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
8810e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8820e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
8830e3ece09SJunchao Zhang   }
8840e3ece09SJunchao Zhang 
885a3f881fbSStefano Zampini   switch (ptype) {
8869371c9d4SSatish Balay   case MATPRODUCT_AB:
8879371c9d4SSatish Balay     transA = false;
8889371c9d4SSatish Balay     transB = false;
8899371c9d4SSatish Balay     break;
8909371c9d4SSatish Balay   case MATPRODUCT_AtB:
8919371c9d4SSatish Balay     transA = true;
8929371c9d4SSatish Balay     transB = false;
8939371c9d4SSatish Balay     break;
8949371c9d4SSatish Balay   case MATPRODUCT_ABt:
8959371c9d4SSatish Balay     transA = false;
8969371c9d4SSatish Balay     transB = true;
8979371c9d4SSatish Balay     break;
898d71ae5a4SJacob Faibussowitsch   default:
899d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
900a3f881fbSStefano Zampini   }
9010e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
902076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
903a3f881fbSStefano Zampini 
904076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
905866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
906866eb059SJunchao Zhang 
907866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
908866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
909866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
910866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
911866eb059SJunchao Zhang   #endif
912866eb059SJunchao Zhang #endif
9130e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
914076ba34aSJunchao Zhang 
9159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
916076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
917076ba34aSJunchao Zhang   if (transA) {
9189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
919076ba34aSJunchao Zhang     transA = false;
920076ba34aSJunchao Zhang   }
921076ba34aSJunchao Zhang 
922076ba34aSJunchao Zhang   if (transB) {
9239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
924076ba34aSJunchao Zhang     transB = false;
925076ba34aSJunchao Zhang   }
926076ba34aSJunchao Zhang 
9270e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
929076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
930076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
931076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
932076ba34aSJunchao Zhang   */
9330e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9340e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
935866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
936866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
937866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
938e944a159SJunchao Zhang #endif
9399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
940076ba34aSJunchao Zhang 
9419566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9429566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
943076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
945a3f881fbSStefano Zampini }
946a3f881fbSStefano Zampini 
947a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
948d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
949d71ae5a4SJacob Faibussowitsch {
950076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
951a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
952a3f881fbSStefano Zampini 
953a3f881fbSStefano Zampini   PetscFunctionBegin;
954a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9559566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
95648a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
957a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
958a3f881fbSStefano Zampini     switch (product->type) {
959a3f881fbSStefano Zampini     case MATPRODUCT_AB:
960a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
961d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
962d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
963d71ae5a4SJacob Faibussowitsch       break;
964a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
965a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
966d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
967d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
968d71ae5a4SJacob Faibussowitsch       break;
969d71ae5a4SJacob Faibussowitsch     default:
970d71ae5a4SJacob Faibussowitsch       break;
971a3f881fbSStefano Zampini     }
972a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
9739566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
974a3f881fbSStefano Zampini   }
9753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
976a3f881fbSStefano Zampini }
977a587d139SMark 
978d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
979d71ae5a4SJacob Faibussowitsch {
980f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
981f0cf5187SStefano Zampini 
982f0cf5187SStefano Zampini   PetscFunctionBegin;
9839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
9849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
985f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
986*d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
9879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
9889566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
9899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
9903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
991f0cf5187SStefano Zampini }
992f0cf5187SStefano Zampini 
993f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
994f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
995f4747e26SJunchao Zhang {
996f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
997f4747e26SJunchao Zhang 
998f4747e26SJunchao Zhang   PetscFunctionBegin;
999f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1000f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1001f4747e26SJunchao Zhang 
1002f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1003f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1004f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1005f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1006f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1007*d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1008f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1009f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1010f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1011f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1012f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1013f4747e26SJunchao Zhang   }
1014f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1015f4747e26SJunchao Zhang }
1016f4747e26SJunchao Zhang 
1017f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1018f4747e26SJunchao Zhang {
1019f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1020f4747e26SJunchao Zhang 
1021f4747e26SJunchao Zhang   PetscFunctionBegin;
1022f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1023f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1024f4747e26SJunchao Zhang     PetscInt                   n, nv;
1025f4747e26SJunchao Zhang 
1026f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1027f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1028f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1029f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1030f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1031f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1032f4747e26SJunchao Zhang 
1033f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1034f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1035f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1036f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1037*d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1038f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1039f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1040f4747e26SJunchao Zhang       }));
1041f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1042f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1043f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1044f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1045f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1046f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1047f4747e26SJunchao Zhang   }
1048f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1049f4747e26SJunchao Zhang }
1050f4747e26SJunchao Zhang 
1051f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1052f4747e26SJunchao Zhang {
1053f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1054f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1055f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1056f4747e26SJunchao Zhang 
1057f4747e26SJunchao Zhang   PetscFunctionBegin;
1058f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1059f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1060f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1061f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1062f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1063f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1064f4747e26SJunchao Zhang   if (ll) {
1065f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1066f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1067f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1068f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1069*d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1070f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1071f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1072f4747e26SJunchao Zhang         // scale entries on the row
1073f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1074f4747e26SJunchao Zhang       }));
1075f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1076f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1077f4747e26SJunchao Zhang   }
1078f4747e26SJunchao Zhang   if (rr) {
1079f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1080f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1081f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1082f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1083*d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1084f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1085f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1086f4747e26SJunchao Zhang   }
1087f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1088f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1089f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1090f4747e26SJunchao Zhang }
1091f4747e26SJunchao Zhang 
1092d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1093d71ae5a4SJacob Faibussowitsch {
1094076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1095a587d139SMark 
1096a587d139SMark   PetscFunctionBegin;
1097076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
10982328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1099*d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
11009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
11012328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
11029566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
11032328674fSJunchao Zhang   }
11043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1105a587d139SMark }
1106a587d139SMark 
1107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1108d71ae5a4SJacob Faibussowitsch {
1109f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1110f78ce678SMark Adams   PetscInt              n;
1111f78ce678SMark Adams   PetscScalarKokkosView xv;
1112f78ce678SMark Adams 
1113f78ce678SMark Adams   PetscFunctionBegin;
1114f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1115f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1116f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1117f78ce678SMark Adams 
1118f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1119f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1120f78ce678SMark Adams 
1121f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1122f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1123f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1124f78ce678SMark Adams 
1125f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11269371c9d4SSatish Balay   Kokkos::parallel_for(
1127*d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1128f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1129f78ce678SMark Adams       else xv(i) = 0;
1130f78ce678SMark Adams     });
1131f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1133f78ce678SMark Adams }
1134f78ce678SMark Adams 
1135db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1136d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1137d71ae5a4SJacob Faibussowitsch {
1138db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1139db78de30SJunchao Zhang 
1140db78de30SJunchao Zhang   PetscFunctionBegin;
1141db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11424f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1143db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1145db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1146076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1148db78de30SJunchao Zhang }
1149db78de30SJunchao Zhang 
1150d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1151d71ae5a4SJacob Faibussowitsch {
1152db78de30SJunchao Zhang   PetscFunctionBegin;
1153db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11544f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1155db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1157db78de30SJunchao Zhang }
1158db78de30SJunchao Zhang 
1159d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1160d71ae5a4SJacob Faibussowitsch {
1161db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1162db78de30SJunchao Zhang 
1163db78de30SJunchao Zhang   PetscFunctionBegin;
1164db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11654f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1166db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1168db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1169076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1171db78de30SJunchao Zhang }
1172db78de30SJunchao Zhang 
1173d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1174d71ae5a4SJacob Faibussowitsch {
1175db78de30SJunchao Zhang   PetscFunctionBegin;
1176db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11774f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1178db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1181db78de30SJunchao Zhang }
1182db78de30SJunchao Zhang 
1183d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1184d71ae5a4SJacob Faibussowitsch {
1185db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1186db78de30SJunchao Zhang 
1187db78de30SJunchao Zhang   PetscFunctionBegin;
1188db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11894f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1190db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11933ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1194db78de30SJunchao Zhang }
1195db78de30SJunchao Zhang 
1196d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1197d71ae5a4SJacob Faibussowitsch {
1198db78de30SJunchao Zhang   PetscFunctionBegin;
1199db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12004f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1201db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1204db78de30SJunchao Zhang }
1205db78de30SJunchao Zhang 
1206c17cf699SJunchao Zhang /* Computes Y += alpha X */
1207d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1208d71ae5a4SJacob Faibussowitsch {
1209a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1210c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1211c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1212c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
1213*d326c3f1SJunchao Zhang   auto                    &exec = PetscGetKokkosExecutionSpace();
1214a587d139SMark 
1215a587d139SMark   PetscFunctionBegin;
1216c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1217c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1221db78de30SJunchao Zhang 
1222c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1223a587d139SMark     PetscBool e;
12249566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1225a587d139SMark     if (e) {
12269566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1227c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1228a587d139SMark     }
1229a587d139SMark   }
1230db78de30SJunchao Zhang 
1231c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1232c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1233c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1234c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1235c17cf699SJunchao Zhang   */
1236c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1237c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1238c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1239c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1240c17cf699SJunchao Zhang 
1241c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1242*d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12439566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1244c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1245c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1246c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1247c17cf699SJunchao Zhang 
12489371c9d4SSatish Balay     Kokkos::parallel_for(
1249*d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
12500e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
12510e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
12520e3ece09SJunchao Zhang           // Only one thread works in a team
1253c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
12540e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
12550e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
12560e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1257c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1258c17cf699SJunchao Zhang               q++;
1259a587d139SMark             } else {
12600e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
12610e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
12620e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
12630e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
12648b8b16f9SJunchao Zhang #else
12650e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
12668b8b16f9SJunchao Zhang #endif
1267a587d139SMark             }
1268c17cf699SJunchao Zhang           }
1269c17cf699SJunchao Zhang         });
1270c17cf699SJunchao Zhang       });
12719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
12720e3ece09SJunchao Zhang   } else { // different nonzero patterns
1273c17cf699SJunchao Zhang     Mat             Z;
1274c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1275c17cf699SJunchao Zhang     KernelHandle    kh;
12760e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1277c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1278c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1279c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
12809566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
12819566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1282c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1283c17cf699SJunchao Zhang   }
12849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12850e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
12863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1287a587d139SMark }
1288a587d139SMark 
12892c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
12902c4ab24aSJunchao Zhang   PetscCount           n;
12912c4ab24aSJunchao Zhang   PetscCount           Atot;
12922c4ab24aSJunchao Zhang   PetscInt             nz;
12932c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
12942c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
12952c4ab24aSJunchao Zhang 
12962c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
12972c4ab24aSJunchao Zhang   {
12982c4ab24aSJunchao Zhang     nz   = coo_h->nz;
12992c4ab24aSJunchao Zhang     n    = coo_h->n;
13002c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
13012c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
13022c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
13032c4ab24aSJunchao Zhang   }
13042c4ab24aSJunchao Zhang };
13052c4ab24aSJunchao Zhang 
13062c4ab24aSJunchao Zhang static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
13072c4ab24aSJunchao Zhang {
13082c4ab24aSJunchao Zhang   PetscFunctionBegin;
13092c4ab24aSJunchao Zhang   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
13102c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13112c4ab24aSJunchao Zhang }
13122c4ab24aSJunchao Zhang 
1313d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1314d71ae5a4SJacob Faibussowitsch {
131542550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
131642550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
13172c4ab24aSJunchao Zhang   PetscContainer             container_h, container_d;
13182c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13192c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
132042550becSJunchao Zhang 
132142550becSJunchao Zhang   PetscFunctionBegin;
13229566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1323394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
132442550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1325cbc6b225SStefano Zampini   delete akok;
1326f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13279566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13282c4ab24aSJunchao Zhang 
13292c4ab24aSJunchao Zhang   // Copy the COO struct to device
13302c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13312c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13322c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13332c4ab24aSJunchao Zhang 
13342c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
13352c4ab24aSJunchao Zhang   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
13362c4ab24aSJunchao Zhang   PetscCall(PetscContainerSetPointer(container_d, coo_d));
13372c4ab24aSJunchao Zhang   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
13382c4ab24aSJunchao Zhang   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
13392c4ab24aSJunchao Zhang   PetscCall(PetscContainerDestroy(&container_d));
13403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
134142550becSJunchao Zhang }
134242550becSJunchao Zhang 
1343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1344d71ae5a4SJacob Faibussowitsch {
134542550becSJunchao Zhang   MatScalarKokkosView        Aa;
134642550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
134742550becSJunchao Zhang   PetscMemType               memtype;
13482c4ab24aSJunchao Zhang   PetscContainer             container;
13492c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
135042550becSJunchao Zhang 
135142550becSJunchao Zhang   PetscFunctionBegin;
13522c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13532c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
13542c4ab24aSJunchao Zhang 
13552c4ab24aSJunchao Zhang   const auto &n    = coo->n;
13562c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
13572c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
13582c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
13592c4ab24aSJunchao Zhang 
13609566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
136142550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
13622c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
136342550becSJunchao Zhang   } else {
13642c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
136542550becSJunchao Zhang   }
136642550becSJunchao Zhang 
1367c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1368c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
136942550becSJunchao Zhang 
137008bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
13719371c9d4SSatish Balay   Kokkos::parallel_for(
1372*d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1373c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1374c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1375c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1376c7b718f4SJunchao Zhang     });
137708bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1378394ed5ebSJunchao Zhang 
13799566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
13809566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
13813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138242550becSJunchao Zhang }
138342550becSJunchao Zhang 
1384d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1385d71ae5a4SJacob Faibussowitsch {
13868f7e8f9dSMark Adams   PetscFunctionBegin;
13879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
13889566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
13898f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
13903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13918f7e8f9dSMark Adams }
13928f7e8f9dSMark Adams 
1393d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1394d71ae5a4SJacob Faibussowitsch {
1395076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1396076ba34aSJunchao Zhang 
13978c3ff71bSJunchao Zhang   PetscFunctionBegin;
1398076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
13996f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
14006f3d89d0SStefano Zampini 
14018c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
14028c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
14038c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1404a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1405f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1406a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1407076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
14088c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
14098c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14108c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14118c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14128c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14138c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1414076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14150ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1416152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1417f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1418f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1419f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1420f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1421076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1422076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1423076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1424076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1425076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1426076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14277ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
142842550becSJunchao Zhang 
14299566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14309566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
14313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1432076ba34aSJunchao Zhang }
1433076ba34aSJunchao Zhang 
14349d13fa56SJunchao Zhang /*
14359d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14369d13fa56SJunchao Zhang 
14379d13fa56SJunchao Zhang   Input Parameters:
14389d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14399d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14409d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14419d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14429d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14439d13fa56SJunchao Zhang 
14449d13fa56SJunchao Zhang   Output Parameter:
14459d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14469d13fa56SJunchao Zhang */
14479d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14489d13fa56SJunchao Zhang {
14499d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14509d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14519d13fa56SJunchao Zhang 
14529d13fa56SJunchao Zhang   PetscFunctionBegin;
14539d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14549d13fa56SJunchao Zhang 
14559d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14569d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14579d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14589d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
14599d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
14609d13fa56SJunchao Zhang   // TODO: how to tune the team size?
14619d13fa56SJunchao Zhang #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
14629d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
14639d13fa56SJunchao Zhang #else
14649d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
14659d13fa56SJunchao Zhang #endif
14669d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1467*d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
14689d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
14699d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
14709d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
14719d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
14729d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
14739d13fa56SJunchao Zhang 
14749d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
14759d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
14769d13fa56SJunchao Zhang 
14779d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
14789d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
14799d13fa56SJunchao Zhang 
14809d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
14819d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
14829d13fa56SJunchao Zhang               B(r, c) = 0.0;
14839d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
14849d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
14859d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
14869d13fa56SJunchao Zhang               B(r, c) = 0.0;
14879d13fa56SJunchao Zhang             }
14889d13fa56SJunchao Zhang           }
14899d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
14909d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
14919d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
14929d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
14939d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
14949d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
14959d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
14969d13fa56SJunchao Zhang           }
14979d13fa56SJunchao Zhang         }
14989d13fa56SJunchao Zhang       });
14999d13fa56SJunchao Zhang 
15009d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
15019d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
15029d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
15039d13fa56SJunchao Zhang     }));
15049d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
15059d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15069d13fa56SJunchao Zhang }
15079d13fa56SJunchao Zhang 
1508d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1509d71ae5a4SJacob Faibussowitsch {
1510076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1511076ba34aSJunchao Zhang   PetscInt    i, m, n;
1512e36ced11SJunchao Zhang   auto       &exec = PetscGetKokkosExecutionSpace();
1513076ba34aSJunchao Zhang 
1514076ba34aSJunchao Zhang   PetscFunctionBegin;
15155f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1516076ba34aSJunchao Zhang 
1517076ba34aSJunchao Zhang   m = akok->nrows();
1518076ba34aSJunchao Zhang   n = akok->ncols();
15199566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15209566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1521076ba34aSJunchao Zhang 
1522076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1524076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ *)(A)->data;
1525076ba34aSJunchao Zhang 
1526e36ced11SJunchao Zhang   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1527e36ced11SJunchao Zhang   PetscCallCXX(akok->j_dual.sync_host(exec));
1528e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1529076ba34aSJunchao Zhang 
1530076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1531076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1532076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1533076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1534076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1535076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1536076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1537076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1538076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1539076ba34aSJunchao Zhang 
15409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1542ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1543076ba34aSJunchao Zhang 
1544076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1545076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1546ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15479566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15489566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1550076ba34aSJunchao Zhang }
1551076ba34aSJunchao Zhang 
15520e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15530e3ece09SJunchao Zhang {
15540e3ece09SJunchao Zhang   PetscFunctionBegin;
15550e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15560e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15570e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15580e3ece09SJunchao Zhang }
15590e3ece09SJunchao Zhang 
15600e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
15610e3ece09SJunchao Zhang {
15620e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
15634d86920dSPierre Jolivet 
15640e3ece09SJunchao Zhang   PetscFunctionBegin;
15650e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
15660e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
15670e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15680e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15690e3ece09SJunchao Zhang }
15700e3ece09SJunchao Zhang 
1571076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1572076ba34aSJunchao Zhang 
1573076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1574076ba34aSJunchao Zhang  */
1575d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1576d71ae5a4SJacob Faibussowitsch {
1577076ba34aSJunchao Zhang   PetscFunctionBegin;
15789566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
15799566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
15818c3ff71bSJunchao Zhang }
15828c3ff71bSJunchao Zhang 
1583152b3e56SJunchao Zhang /*@C
158411a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
15858c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
158620f4b53cSBarry Smith   Kokkos for calculations.
15878c3ff71bSJunchao Zhang 
15888c3ff71bSJunchao Zhang   Collective
15898c3ff71bSJunchao Zhang 
15908c3ff71bSJunchao Zhang   Input Parameters:
159111a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
15928c3ff71bSJunchao Zhang . m    - number of rows
15938c3ff71bSJunchao Zhang . n    - number of columns
159420f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
159520f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
15968c3ff71bSJunchao Zhang 
15978c3ff71bSJunchao Zhang   Output Parameter:
15988c3ff71bSJunchao Zhang . A - the matrix
15998c3ff71bSJunchao Zhang 
16002ef1f0ffSBarry Smith   Level: intermediate
16012ef1f0ffSBarry Smith 
16022ef1f0ffSBarry Smith   Notes:
160311a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
16048c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
160511a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
16068c3ff71bSJunchao Zhang 
160711a5261eSBarry Smith   The AIJ format, also called
16082ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
16098c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
161020f4b53cSBarry Smith   either one (as in Fortran) or zero.
16118c3ff71bSJunchao Zhang 
16122ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16132ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16142ef1f0ffSBarry Smith   allocation.
16158c3ff71bSJunchao Zhang 
1616fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16178c3ff71bSJunchao Zhang @*/
1618d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1619d71ae5a4SJacob Faibussowitsch {
16208c3ff71bSJunchao Zhang   PetscFunctionBegin;
16219566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16229566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16239566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16249566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16278c3ff71bSJunchao Zhang }
1628930e68a5SMark Adams 
1629d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1630d71ae5a4SJacob Faibussowitsch {
1631930e68a5SMark Adams   PetscFunctionBegin;
16329566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
163386a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
16343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
163586a27549SJunchao Zhang }
163686a27549SJunchao Zhang 
1637d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1638d71ae5a4SJacob Faibussowitsch {
163986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
164086a27549SJunchao Zhang 
164186a27549SJunchao Zhang   PetscFunctionBegin;
164286a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
164386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
164486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
164586a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
164686a27549SJunchao Zhang   }
16473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
164886a27549SJunchao Zhang }
164986a27549SJunchao Zhang 
165086a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1651d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1652d71ae5a4SJacob Faibussowitsch {
165386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1654076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
165586a27549SJunchao Zhang 
165686a27549SJunchao Zhang   PetscFunctionBegin;
165786a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
165886a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
16597b8d4ba6SJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
16607b8d4ba6SJunchao Zhang     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
16617b8d4ba6SJunchao Zhang     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
166286a27549SJunchao Zhang 
16639371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
166486a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
166586a27549SJunchao Zhang 
166686a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
166786a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
166886a27549SJunchao Zhang     */
16699371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
167086a27549SJunchao Zhang 
167186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
167286a27549SJunchao Zhang 
167386a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
16747b8d4ba6SJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
16757b8d4ba6SJunchao Zhang     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
16767b8d4ba6SJunchao Zhang     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
167786a27549SJunchao Zhang 
16789371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
167986a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
168086a27549SJunchao Zhang 
168186a27549SJunchao Zhang     /* Sort indices. See comments above */
16829371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
168386a27549SJunchao Zhang 
168486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
168586a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
168686a27549SJunchao Zhang   }
16873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
168886a27549SJunchao Zhang }
168986a27549SJunchao Zhang 
169086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1691d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1692d71ae5a4SJacob Faibussowitsch {
169386a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
169486a27549SJunchao Zhang   PetscScalarKokkosView       xv;
169586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
169686a27549SJunchao Zhang 
169786a27549SJunchao Zhang   PetscFunctionBegin;
16989566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
17009566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17019566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
170286a27549SJunchao Zhang   /* Solve L tmpv = b */
17039566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
170486a27549SJunchao Zhang   /* Solve Ux = tmpv */
17059566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
17069566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17079566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
171086a27549SJunchao Zhang }
171186a27549SJunchao Zhang 
1712076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
1713d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1714d71ae5a4SJacob Faibussowitsch {
171586a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
171686a27549SJunchao Zhang   PetscScalarKokkosView       xv;
171786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
171886a27549SJunchao Zhang 
171986a27549SJunchao Zhang   PetscFunctionBegin;
17209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
17229566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17239566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
172486a27549SJunchao Zhang   /* Solve U^T tmpv = b */
172586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
172686a27549SJunchao Zhang 
172786a27549SJunchao Zhang   /* Solve L^T x = tmpv */
172886a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
17299566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17309566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
173386a27549SJunchao Zhang }
173486a27549SJunchao Zhang 
1735d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1736d71ae5a4SJacob Faibussowitsch {
173786a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
173886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
173986a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
174086a27549SJunchao Zhang 
174186a27549SJunchao Zhang   PetscFunctionBegin;
17429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1744076ba34aSJunchao Zhang 
1745076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1746076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1747076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1748076ba34aSJunchao Zhang 
1749076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
175086a27549SJunchao Zhang 
175186a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
175286a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
175386a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
175486a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
175586a27549SJunchao Zhang   B->ops->matsolve          = NULL;
175686a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
175786a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
175886a27549SJunchao Zhang 
175986a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
176086a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
176186a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1762eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
17639566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
176586a27549SJunchao Zhang }
176686a27549SJunchao Zhang 
1767d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1768d71ae5a4SJacob Faibussowitsch {
176986a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
177086a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
177186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
177286a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
177386a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
177486a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
177586a27549SJunchao Zhang 
177686a27549SJunchao Zhang   PetscFunctionBegin;
17779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
177886a27549SJunchao Zhang   /* Rebuild factors */
17799371c9d4SSatish Balay   if (factors) {
17809371c9d4SSatish Balay     factors->Destroy();
17819371c9d4SSatish Balay   } /* Destroy the old if it exists */
17829371c9d4SSatish Balay   else {
17839371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
17849371c9d4SSatish Balay   }
178586a27549SJunchao Zhang 
178686a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
178786a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
178886a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
178986a27549SJunchao Zhang 
179086a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
179186a27549SJunchao Zhang 
179286a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
179386a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
179486a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
179586a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
179686a27549SJunchao Zhang 
179786a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1798076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1799076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1800076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
180186a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
180286a27549SJunchao Zhang 
180386a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
180486a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
180586a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
180686a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
180786a27549SJunchao Zhang 
180886a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
180986a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
181086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
181186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
181286a27549SJunchao Zhang #else
181386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
181486a27549SJunchao Zhang #endif
181586a27549SJunchao Zhang 
181686a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
181786a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
181886a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
181986a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
182086a27549SJunchao Zhang 
182186a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
18229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
182386a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
182486a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
182586a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1826a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
182786a27549SJunchao Zhang 
182886a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
182986a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
18303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1831930e68a5SMark Adams }
1832930e68a5SMark Adams 
1833d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1834d71ae5a4SJacob Faibussowitsch {
1835930e68a5SMark Adams   PetscFunctionBegin;
1836930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
18373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1838930e68a5SMark Adams }
1839930e68a5SMark Adams 
1840930e68a5SMark Adams /*MC
184186a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
184211a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1843930e68a5SMark Adams 
1844930e68a5SMark Adams   Level: beginner
1845930e68a5SMark Adams 
18461cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1847930e68a5SMark Adams M*/
184886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1849930e68a5SMark Adams {
1850930e68a5SMark Adams   PetscInt n = A->rmap->n;
1851930e68a5SMark Adams 
1852930e68a5SMark Adams   PetscFunctionBegin;
18539566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
18549566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1855930e68a5SMark Adams   (*B)->factortype = ftype;
18569566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
18579566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1858930e68a5SMark Adams 
18598f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18609566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
186186a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
186286a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
186386a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
18649566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
186586a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
186686a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
186798921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1868930e68a5SMark Adams 
18699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1870f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
18713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1872930e68a5SMark Adams }
18738f7e8f9dSMark Adams 
1874d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1875d71ae5a4SJacob Faibussowitsch {
187686a27549SJunchao Zhang   PetscFunctionBegin;
18779566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
18789566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
18793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
188086a27549SJunchao Zhang }
188186a27549SJunchao Zhang 
1882076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1883d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1884d71ae5a4SJacob Faibussowitsch {
1885076ba34aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1886076ba34aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1887076ba34aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1888076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1889076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1890076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1891076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1892076ba34aSJunchao Zhang 
1893076ba34aSJunchao Zhang   PetscFunctionBegin;
18949566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1895076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
18969566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
189748a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
18989566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1899076ba34aSJunchao Zhang   }
19003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1901076ba34aSJunchao Zhang }
1902