xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 49abdd8a111d9c2ef7fc48ade253ef64e07f9b37)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3076ba34aSJunchao Zhang #include <petscpkg_version.h>
4152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
542550becSJunchao Zhang #include <petsc/private/sfimpl.h>
68c3ff71bSJunchao Zhang #include <petscsystypes.h>
78c3ff71bSJunchao Zhang #include <petscerror.h>
88c3ff71bSJunchao Zhang 
98c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
10f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
179d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
189d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
1986a27549SJunchao Zhang 
2042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
218c3ff71bSJunchao Zhang 
220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
24f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
259371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
26f98996d3SJunchao Zhang #else
27f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
28f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
299371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
30f98996d3SJunchao Zhang #endif
31f98996d3SJunchao Zhang 
328c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
338c3ff71bSJunchao Zhang 
34076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
37076ba34aSJunchao Zhang  */
38d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39d71ae5a4SJacob Faibussowitsch {
40076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
41076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
428c3ff71bSJunchao Zhang 
438c3ff71bSJunchao Zhang   PetscFunctionBegin;
443ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
459566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46076ba34aSJunchao Zhang 
47076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49076ba34aSJunchao Zhang 
50076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
51076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
53076ba34aSJunchao Zhang   */
54076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55076ba34aSJunchao Zhang     delete aijkok;
56f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57076ba34aSJunchao Zhang     A->spptr = aijkok;
58f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62076ba34aSJunchao Zhang   }
633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
648c3ff71bSJunchao Zhang }
658c3ff71bSJunchao Zhang 
6686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
67d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68d71ae5a4SJacob Faibussowitsch {
698c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
708c3ff71bSJunchao Zhang 
718c3ff71bSJunchao Zhang   PetscFunctionBegin;
72aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
735f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
75076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
76580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
7786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
788c3ff71bSJunchao Zhang   }
793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
808c3ff71bSJunchao Zhang }
818c3ff71bSJunchao Zhang 
82076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
83d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84d71ae5a4SJacob Faibussowitsch {
8586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
8686a27549SJunchao Zhang 
8786a27549SJunchao Zhang   PetscFunctionBegin;
885f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
8986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
9086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
9186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
9286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
949566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9686a27549SJunchao Zhang }
9786a27549SJunchao Zhang 
98d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99d71ae5a4SJacob Faibussowitsch {
100f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101e36ced11SJunchao Zhang   auto             &exec   = PetscGetKokkosExecutionSpace();
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini   PetscFunctionBegin;
104f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
10586a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
106aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1075f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108e36ced11SJunchao Zhang   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
111f0cf5187SStefano Zampini }
112f0cf5187SStefano Zampini 
113d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114d71ae5a4SJacob Faibussowitsch {
115076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116f0cf5187SStefano Zampini 
117f0cf5187SStefano Zampini   PetscFunctionBegin;
1185519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1195519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1205519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1215519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1225519a089SJose E. Roman   */
1235519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
125e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
127076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
128076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
129076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130076ba34aSJunchao Zhang   }
1313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
132076ba34aSJunchao Zhang }
133076ba34aSJunchao Zhang 
134d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135d71ae5a4SJacob Faibussowitsch {
136076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137076ba34aSJunchao Zhang 
138076ba34aSJunchao Zhang   PetscFunctionBegin;
1395519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144d71ae5a4SJacob Faibussowitsch {
145076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang   PetscFunctionBegin;
1485519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149e36ced11SJunchao Zhang     auto &exec = PetscGetKokkosExecutionSpace();
150e36ced11SJunchao Zhang     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151e36ced11SJunchao Zhang     PetscCallCXX(exec.fence());
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else {
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1552328674fSJunchao Zhang   }
1563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160d71ae5a4SJacob Faibussowitsch {
161076ba34aSJunchao Zhang   PetscFunctionBegin;
162076ba34aSJunchao Zhang   *array = NULL;
1633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
164076ba34aSJunchao Zhang }
165076ba34aSJunchao Zhang 
166d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167d71ae5a4SJacob Faibussowitsch {
168076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169076ba34aSJunchao Zhang 
170076ba34aSJunchao Zhang   PetscFunctionBegin;
1715519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1732328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1742328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1752328674fSJunchao Zhang   }
1763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
177076ba34aSJunchao Zhang }
178076ba34aSJunchao Zhang 
179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180d71ae5a4SJacob Faibussowitsch {
181076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182076ba34aSJunchao Zhang 
183076ba34aSJunchao Zhang   PetscFunctionBegin;
1845519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
186076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1872328674fSJunchao Zhang   }
1883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
189f0cf5187SStefano Zampini }
190f0cf5187SStefano Zampini 
191d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192d71ae5a4SJacob Faibussowitsch {
1937ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1947ee59b9bSJunchao Zhang 
1957ee59b9bSJunchao Zhang   PetscFunctionBegin;
1967ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
1977ee59b9bSJunchao Zhang 
1987ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1997ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2007ee59b9bSJunchao Zhang   if (a) {
2017ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
2027ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2037ee59b9bSJunchao Zhang   }
2047ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2067ee59b9bSJunchao Zhang }
2077ee59b9bSJunchao Zhang 
2080e3ece09SJunchao Zhang /*
2090e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2100e3ece09SJunchao Zhang 
2110e3ece09SJunchao Zhang   Input Parameter:
2120e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2130e3ece09SJunchao Zhang 
2140e3ece09SJunchao Zhang   Output Parameters:
2150e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2170e3ece09SJunchao Zhang */
2180e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219d71ae5a4SJacob Faibussowitsch {
2200e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2210e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2220e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2237b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2240e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2257b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2267b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2270e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2280e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2290e3ece09SJunchao Zhang   PetscInt               *offset;
230152b3e56SJunchao Zhang 
231152b3e56SJunchao Zhang   PetscFunctionBegin;
2320e3ece09SJunchao Zhang   // Populate Ti
2330e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2340e3ece09SJunchao Zhang   Ti++;
2350e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2360e3ece09SJunchao Zhang   Ti--;
2370e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2380e3ece09SJunchao Zhang 
2390e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2400e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2410e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2420e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2430e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2440e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2450e3ece09SJunchao Zhang 
2460e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2470e3ece09SJunchao Zhang       perm[disp] = j;
2480e3ece09SJunchao Zhang       offset[r]++;
249076ba34aSJunchao Zhang     }
2500e3ece09SJunchao Zhang   }
2510e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2520e3ece09SJunchao Zhang 
2530e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2540e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2550e3ece09SJunchao Zhang 
2560e3ece09SJunchao Zhang   // Output perm and T on device
2570e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2580e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2590e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2600e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
262152b3e56SJunchao Zhang }
263152b3e56SJunchao Zhang 
2640e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
2650e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
2660e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267d71ae5a4SJacob Faibussowitsch {
2680e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2690e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2700e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2710e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
272152b3e56SJunchao Zhang 
273152b3e56SJunchao Zhang   PetscFunctionBegin;
2740e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275145b44c9SPierre Jolivet   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
2760e3ece09SJunchao Zhang 
2770e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
2780e3ece09SJunchao Zhang 
2790e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
2800e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
2810e3ece09SJunchao Zhang   } else {
2820e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
2830e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
2840e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
2850e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
2860e3ece09SJunchao Zhang         auto       &Ta   = T.values;
2870e3ece09SJunchao Zhang 
288d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289076ba34aSJunchao Zhang       }
2900e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
2910e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
2920e3ece09SJunchao Zhang 
2930e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
2940e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
2960e3ece09SJunchao Zhang     }
2970e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
2980e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
2990e3ece09SJunchao Zhang   }
3000e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3010e3ece09SJunchao Zhang }
3020e3ece09SJunchao Zhang 
3030e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3040e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3050e3ece09SJunchao Zhang {
3060e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3070e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3080e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3090e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3100e3ece09SJunchao Zhang 
3110e3ece09SJunchao Zhang   PetscFunctionBegin;
3120e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
3130e3ece09SJunchao Zhang   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
3140e3ece09SJunchao Zhang 
3150e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3160e3ece09SJunchao Zhang 
3170e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3180e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3190e3ece09SJunchao Zhang   } else {
3200e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3210e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3220e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3230e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3240e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3250e3ece09SJunchao Zhang 
326d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3270e3ece09SJunchao Zhang       }
3280e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3290e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3300e3ece09SJunchao Zhang 
3310e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3320e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3340e3ece09SJunchao Zhang     }
3350e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3360e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3370e3ece09SJunchao Zhang   }
3383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
339152b3e56SJunchao Zhang }
340a587d139SMark 
3418c3ff71bSJunchao Zhang /* y = A x */
342d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343d71ae5a4SJacob Faibussowitsch {
3448c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
345152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
346152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3478c3ff71bSJunchao Zhang 
3488c3ff71bSJunchao Zhang   PetscFunctionBegin;
3499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3519566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3529566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3538c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3559566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3569566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3618c3ff71bSJunchao Zhang }
3628c3ff71bSJunchao Zhang 
3638c3ff71bSJunchao Zhang /* y = A^T x */
364d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365d71ae5a4SJacob Faibussowitsch {
3668c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
367152b3e56SJunchao Zhang   const char                *mode;
368152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
369152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3700e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
3718c3ff71bSJunchao Zhang 
3728c3ff71bSJunchao Zhang   PetscFunctionBegin;
3739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3759566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3769566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3789566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379152b3e56SJunchao Zhang     mode = "N";
380152b3e56SJunchao Zhang   } else {
381076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3820e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
383152b3e56SJunchao Zhang     mode   = "T";
384152b3e56SJunchao Zhang   }
385d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
3869566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3879566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3880e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
3899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3918c3ff71bSJunchao Zhang }
3928c3ff71bSJunchao Zhang 
3938c3ff71bSJunchao Zhang /* y = A^H x */
394d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395d71ae5a4SJacob Faibussowitsch {
3968c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
397152b3e56SJunchao Zhang   const char                *mode;
398152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
399152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4000e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang   PetscFunctionBegin;
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4059566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4069566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409152b3e56SJunchao Zhang     mode = "N";
410152b3e56SJunchao Zhang   } else {
411076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4120e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
413152b3e56SJunchao Zhang     mode   = "C";
414152b3e56SJunchao Zhang   }
415d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4169566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4179566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4180e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4218c3ff71bSJunchao Zhang }
4228c3ff71bSJunchao Zhang 
4238c3ff71bSJunchao Zhang /* z = A x + y */
424d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425d71ae5a4SJacob Faibussowitsch {
4268c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
42792896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
428152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4298c3ff71bSJunchao Zhang 
4308c3ff71bSJunchao Zhang   PetscFunctionBegin;
4319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
43392896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
4349566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
43592896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
4368c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4389566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
43992896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4438c3ff71bSJunchao Zhang }
4448c3ff71bSJunchao Zhang 
4458c3ff71bSJunchao Zhang /* z = A^T x + y */
446d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
447d71ae5a4SJacob Faibussowitsch {
4488c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
449152b3e56SJunchao Zhang   const char                *mode;
45092896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
451152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4520e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4538c3ff71bSJunchao Zhang 
4548c3ff71bSJunchao Zhang   PetscFunctionBegin;
4559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
45792896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4589566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
45992896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
460152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4619566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
462152b3e56SJunchao Zhang     mode = "N";
463152b3e56SJunchao Zhang   } else {
464076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4650e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
466152b3e56SJunchao Zhang     mode   = "T";
467152b3e56SJunchao Zhang   }
468d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
4699566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
47092896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4710e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4748c3ff71bSJunchao Zhang }
4758c3ff71bSJunchao Zhang 
4768c3ff71bSJunchao Zhang /* z = A^H x + y */
477d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
478d71ae5a4SJacob Faibussowitsch {
4798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
480152b3e56SJunchao Zhang   const char                *mode;
48192896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
482152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4830e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4848c3ff71bSJunchao Zhang 
4858c3ff71bSJunchao Zhang   PetscFunctionBegin;
4869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
48892896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4899566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
49092896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
491152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4929566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
493152b3e56SJunchao Zhang     mode = "N";
494152b3e56SJunchao Zhang   } else {
495076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4960e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
497152b3e56SJunchao Zhang     mode   = "C";
498152b3e56SJunchao Zhang   }
499d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5009566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
50192896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5020e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
505152b3e56SJunchao Zhang }
506152b3e56SJunchao Zhang 
50766976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508d71ae5a4SJacob Faibussowitsch {
509152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510152b3e56SJunchao Zhang 
511152b3e56SJunchao Zhang   PetscFunctionBegin;
512152b3e56SJunchao Zhang   switch (op) {
513152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
514152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5159566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
517152b3e56SJunchao Zhang     break;
518d71ae5a4SJacob Faibussowitsch   default:
519d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520d71ae5a4SJacob Faibussowitsch     break;
521152b3e56SJunchao Zhang   }
5223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5238c3ff71bSJunchao Zhang }
5248c3ff71bSJunchao Zhang 
525076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
526d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527d71ae5a4SJacob Faibussowitsch {
528076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5298c3ff71bSJunchao Zhang 
5308c3ff71bSJunchao Zhang   PetscFunctionBegin;
5319566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
532076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
5339566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
5348c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5359566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5375f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5389566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5399566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5409566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5419566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
542076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5445f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5468c3ff71bSJunchao Zhang     }
547076ba34aSJunchao Zhang   }
5483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5498c3ff71bSJunchao Zhang }
5508c3ff71bSJunchao Zhang 
551076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553076ba34aSJunchao Zhang  */
554d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555d71ae5a4SJacob Faibussowitsch {
556076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
557076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558076ba34aSJunchao Zhang   Mat               mat;
5598c3ff71bSJunchao Zhang 
5608c3ff71bSJunchao Zhang   PetscFunctionBegin;
561076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5629566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563076ba34aSJunchao Zhang   mat = *B;
564f4747e26SJunchao Zhang   if (A->assembled) {
565076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
567076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568076ba34aSJunchao Zhang     /* Now copy values to B if needed */
569076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
570076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
571076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
573076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
574076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
576076ba34aSJunchao Zhang       }
577076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
580076ba34aSJunchao Zhang     }
581076ba34aSJunchao Zhang     mat->spptr = bkok;
582076ba34aSJunchao Zhang   }
583076ba34aSJunchao Zhang 
5849566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5859566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
5869566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
5879566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5898c3ff71bSJunchao Zhang }
5908c3ff71bSJunchao Zhang 
591d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592d71ae5a4SJacob Faibussowitsch {
5930ecb592aSJunchao Zhang   Mat               At;
5940e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
5950ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
5960ecb592aSJunchao Zhang 
5970ecb592aSJunchao Zhang   PetscFunctionBegin;
5987fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
5999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6000ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6020e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6039566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6040ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6059566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6060ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6070ecb592aSJunchao Zhang     if ((*B)->assembled) {
6080ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6090e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6109566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6110ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6120ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6130e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6140e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6150e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6160e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6170ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6180ecb592aSJunchao Zhang   }
6193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6200ecb592aSJunchao Zhang }
6210ecb592aSJunchao Zhang 
622d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623d71ae5a4SJacob Faibussowitsch {
62486a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6258c3ff71bSJunchao Zhang 
6268c3ff71bSJunchao Zhang   PetscFunctionBegin;
62786a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
62886a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6298c3ff71bSJunchao Zhang     delete aijkok;
63086a27549SJunchao Zhang   } else {
63186a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
63286a27549SJunchao Zhang   }
633cbc6b225SStefano Zampini   A->spptr = NULL;
6349566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6359566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6369566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
6379566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6398c3ff71bSJunchao Zhang }
6408c3ff71bSJunchao Zhang 
6413f3ba80aSJunchao Zhang /*MC
6423f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6433f3ba80aSJunchao Zhang 
64415229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6453f3ba80aSJunchao Zhang 
6462ef1f0ffSBarry Smith    Options Database Key:
64711a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6483f3ba80aSJunchao Zhang 
6493f3ba80aSJunchao Zhang   Level: beginner
6503f3ba80aSJunchao Zhang 
6511cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6523f3ba80aSJunchao Zhang M*/
653d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654d71ae5a4SJacob Faibussowitsch {
65586a27549SJunchao Zhang   PetscFunctionBegin;
6569566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6579566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6589566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
66086a27549SJunchao Zhang }
66186a27549SJunchao Zhang 
662076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664d71ae5a4SJacob Faibussowitsch {
665076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
666076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
668076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
669076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
670076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
671a3f881fbSStefano Zampini 
672a3f881fbSStefano Zampini   PetscFunctionBegin;
673076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
674076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
6754f572ea9SToby Isaac   PetscAssertPointer(C, 4);
676076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
6785f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
6795f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
680076ba34aSJunchao Zhang 
6819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
684076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
685076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
688076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
689076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
690076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
691076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
692076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
693076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
694076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
695076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
696076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
697076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
698076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
702076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
703076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
704076ba34aSJunchao Zhang 
705076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7069371c9d4SSatish Balay     Kokkos::parallel_for(
707d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
709076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
710076ba34aSJunchao Zhang 
711076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712076ba34aSJunchao Zhang                                                    ci(i) = coffset;
713076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714076ba34aSJunchao Zhang         });
715076ba34aSJunchao Zhang 
716076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717076ba34aSJunchao Zhang           if (k < alen) {
718076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
719076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
720076ba34aSJunchao Zhang           } else {
721076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
722076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723076ba34aSJunchao Zhang           }
724076ba34aSJunchao Zhang         });
725076ba34aSJunchao Zhang       });
726076ba34aSJunchao Zhang     ca_dual.modify_device();
727076ba34aSJunchao Zhang     ci_dual.modify_device();
728076ba34aSJunchao Zhang     cj_dual.modify_device();
7299566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
7309566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
732076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
733076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
736076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
737076ba34aSJunchao Zhang 
7389371c9d4SSatish Balay     Kokkos::parallel_for(
739d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
741076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745076ba34aSJunchao Zhang         });
746076ba34aSJunchao Zhang       });
7479566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748076ba34aSJunchao Zhang   }
7493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
750076ba34aSJunchao Zhang }
751076ba34aSJunchao Zhang 
752d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753d71ae5a4SJacob Faibussowitsch {
754076ba34aSJunchao Zhang   PetscFunctionBegin;
755076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
757a3f881fbSStefano Zampini }
758a3f881fbSStefano Zampini 
759d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760d71ae5a4SJacob Faibussowitsch {
761a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
762a3f881fbSStefano Zampini   Mat                          A, B;
763076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
764a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
766076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
7670e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
768a3f881fbSStefano Zampini 
769a3f881fbSStefano Zampini   PetscFunctionBegin;
770a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7715f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
773076ba34aSJunchao Zhang 
7740e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
7750e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
7760e3ece09SJunchao Zhang   // we still do numeric.
7770e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
7780e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
7793ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
780076ba34aSJunchao Zhang   }
781076ba34aSJunchao Zhang 
782076ba34aSJunchao Zhang   switch (product->type) {
7839371c9d4SSatish Balay   case MATPRODUCT_AB:
7849371c9d4SSatish Balay     transA = false;
7859371c9d4SSatish Balay     transB = false;
7869371c9d4SSatish Balay     break;
7879371c9d4SSatish Balay   case MATPRODUCT_AtB:
7889371c9d4SSatish Balay     transA = true;
7899371c9d4SSatish Balay     transB = false;
7909371c9d4SSatish Balay     break;
7919371c9d4SSatish Balay   case MATPRODUCT_ABt:
7929371c9d4SSatish Balay     transA = false;
7939371c9d4SSatish Balay     transB = true;
7949371c9d4SSatish Balay     break;
795d71ae5a4SJacob Faibussowitsch   default:
796d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797076ba34aSJunchao Zhang   }
798076ba34aSJunchao Zhang 
799a3f881fbSStefano Zampini   A = product->A;
800a3f881fbSStefano Zampini   B = product->B;
8019566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
806076ba34aSJunchao Zhang 
8075f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
808076ba34aSJunchao Zhang 
8090e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8100e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
811076ba34aSJunchao Zhang 
812076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813076ba34aSJunchao Zhang   if (transA) {
8149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815076ba34aSJunchao Zhang     transA = false;
816a3f881fbSStefano Zampini   }
817a3f881fbSStefano Zampini 
818076ba34aSJunchao Zhang   if (transB) {
8199566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820076ba34aSJunchao Zhang     transB = false;
821076ba34aSJunchao Zhang   }
8229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8230e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8240e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827e944a159SJunchao Zhang #endif
828866eb059SJunchao Zhang 
8299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
832a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8339566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8349566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8359566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836a3f881fbSStefano Zampini   c->reallocs         = 0;
837076ba34aSJunchao Zhang   C->info.mallocs     = 0;
838a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
839a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
840a3f881fbSStefano Zampini   C->num_ass++;
8413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
842a3f881fbSStefano Zampini }
843a3f881fbSStefano Zampini 
844d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845d71ae5a4SJacob Faibussowitsch {
846076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
847076ba34aSJunchao Zhang   MatProductType               ptype;
848076ba34aSJunchao Zhang   Mat                          A, B;
849076ba34aSJunchao Zhang   bool                         transA, transB;
850076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
852076ba34aSJunchao Zhang   MPI_Comm                     comm;
8530e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
854a3f881fbSStefano Zampini 
855a3f881fbSStefano Zampini   PetscFunctionBegin;
856a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8579566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8585f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859a3f881fbSStefano Zampini   A = product->A;
860a3f881fbSStefano Zampini   B = product->B;
8619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8650e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8660e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
867076ba34aSJunchao Zhang 
868a3f881fbSStefano Zampini   ptype = product->type;
8690e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
8700e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
8710e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8720e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
8730e3ece09SJunchao Zhang   }
8740e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
8750e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
8760e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
8770e3ece09SJunchao Zhang   }
8780e3ece09SJunchao Zhang 
879a3f881fbSStefano Zampini   switch (ptype) {
8809371c9d4SSatish Balay   case MATPRODUCT_AB:
8819371c9d4SSatish Balay     transA = false;
8829371c9d4SSatish Balay     transB = false;
8839371c9d4SSatish Balay     break;
8849371c9d4SSatish Balay   case MATPRODUCT_AtB:
8859371c9d4SSatish Balay     transA = true;
8869371c9d4SSatish Balay     transB = false;
8879371c9d4SSatish Balay     break;
8889371c9d4SSatish Balay   case MATPRODUCT_ABt:
8899371c9d4SSatish Balay     transA = false;
8909371c9d4SSatish Balay     transB = true;
8919371c9d4SSatish Balay     break;
892d71ae5a4SJacob Faibussowitsch   default:
893d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894a3f881fbSStefano Zampini   }
8950e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
897a3f881fbSStefano Zampini 
898076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
899866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
900866eb059SJunchao Zhang 
901866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905866eb059SJunchao Zhang   #endif
906866eb059SJunchao Zhang #endif
9070e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
908076ba34aSJunchao Zhang 
9099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
910076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911076ba34aSJunchao Zhang   if (transA) {
9129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913076ba34aSJunchao Zhang     transA = false;
914076ba34aSJunchao Zhang   }
915076ba34aSJunchao Zhang 
916076ba34aSJunchao Zhang   if (transB) {
9179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918076ba34aSJunchao Zhang     transB = false;
919076ba34aSJunchao Zhang   }
920076ba34aSJunchao Zhang 
9210e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
925076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926076ba34aSJunchao Zhang   */
9270e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9280e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932e944a159SJunchao Zhang #endif
9339566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
934076ba34aSJunchao Zhang 
9359566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9369566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
939a3f881fbSStefano Zampini }
940a3f881fbSStefano Zampini 
941a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
942d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943d71ae5a4SJacob Faibussowitsch {
944076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
945a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
946a3f881fbSStefano Zampini 
947a3f881fbSStefano Zampini   PetscFunctionBegin;
948a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9499566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
95048a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
952a3f881fbSStefano Zampini     switch (product->type) {
953a3f881fbSStefano Zampini     case MATPRODUCT_AB:
954a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
955d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
956d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957d71ae5a4SJacob Faibussowitsch       break;
958a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
959a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
960d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
961d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962d71ae5a4SJacob Faibussowitsch       break;
963d71ae5a4SJacob Faibussowitsch     default:
964d71ae5a4SJacob Faibussowitsch       break;
965a3f881fbSStefano Zampini     }
966a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
9679566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968a3f881fbSStefano Zampini   }
9693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
970a3f881fbSStefano Zampini }
971a587d139SMark 
972d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973d71ae5a4SJacob Faibussowitsch {
974f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
975f0cf5187SStefano Zampini 
976f0cf5187SStefano Zampini   PetscFunctionBegin;
9779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
9789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
9819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
9829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
9839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
9843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
985f0cf5187SStefano Zampini }
986f0cf5187SStefano Zampini 
987f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
988f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
989f4747e26SJunchao Zhang {
990f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
991f4747e26SJunchao Zhang 
992f4747e26SJunchao Zhang   PetscFunctionBegin;
993f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
994f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
995f4747e26SJunchao Zhang 
996f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
997f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
998f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
999f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1000f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1001d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1002f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1004f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1005f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1006f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1007f4747e26SJunchao Zhang   }
1008f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1009f4747e26SJunchao Zhang }
1010f4747e26SJunchao Zhang 
1011f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1012f4747e26SJunchao Zhang {
1013f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1014f4747e26SJunchao Zhang 
1015f4747e26SJunchao Zhang   PetscFunctionBegin;
1016f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1017f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1018f4747e26SJunchao Zhang     PetscInt                   n, nv;
1019f4747e26SJunchao Zhang 
1020f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1021f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1022f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1023f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1024f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1025f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1026f4747e26SJunchao Zhang 
1027f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1028f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1029f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1030f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1031d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1032f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1033f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1034f4747e26SJunchao Zhang       }));
1035f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1036f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1037f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1038f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1039f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1040f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1041f4747e26SJunchao Zhang   }
1042f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1043f4747e26SJunchao Zhang }
1044f4747e26SJunchao Zhang 
1045f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1046f4747e26SJunchao Zhang {
1047f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1048f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1049f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1050f4747e26SJunchao Zhang 
1051f4747e26SJunchao Zhang   PetscFunctionBegin;
1052f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1053f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1056f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1057f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1058f4747e26SJunchao Zhang   if (ll) {
1059f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1060f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1061f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1062f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1063d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1064f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1065f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1066f4747e26SJunchao Zhang         // scale entries on the row
1067f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1068f4747e26SJunchao Zhang       }));
1069f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1070f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1071f4747e26SJunchao Zhang   }
1072f4747e26SJunchao Zhang   if (rr) {
1073f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1074f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1075f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1076f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1077d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1078f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1079f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1080f4747e26SJunchao Zhang   }
1081f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1083f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1084f4747e26SJunchao Zhang }
1085f4747e26SJunchao Zhang 
1086d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1087d71ae5a4SJacob Faibussowitsch {
1088076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1089a587d139SMark 
1090a587d139SMark   PetscFunctionBegin;
1091076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
10922328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1093d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
10949566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
10952328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
10969566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
10972328674fSJunchao Zhang   }
10983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1099a587d139SMark }
1100a587d139SMark 
1101d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1102d71ae5a4SJacob Faibussowitsch {
1103f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1104f78ce678SMark Adams   PetscInt              n;
1105f78ce678SMark Adams   PetscScalarKokkosView xv;
1106f78ce678SMark Adams 
1107f78ce678SMark Adams   PetscFunctionBegin;
1108f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1109f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1110f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1111f78ce678SMark Adams 
1112f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1113f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1114f78ce678SMark Adams 
1115f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1116f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1117f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1118f78ce678SMark Adams 
1119f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11209371c9d4SSatish Balay   Kokkos::parallel_for(
1121d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1122f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1123f78ce678SMark Adams       else xv(i) = 0;
1124f78ce678SMark Adams     });
1125f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1127f78ce678SMark Adams }
1128f78ce678SMark Adams 
1129db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1130d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1131d71ae5a4SJacob Faibussowitsch {
1132db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1133db78de30SJunchao Zhang 
1134db78de30SJunchao Zhang   PetscFunctionBegin;
1135db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11364f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1137db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11389566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1139db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1140076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1142db78de30SJunchao Zhang }
1143db78de30SJunchao Zhang 
1144d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1145d71ae5a4SJacob Faibussowitsch {
1146db78de30SJunchao Zhang   PetscFunctionBegin;
1147db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11484f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1149db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1151db78de30SJunchao Zhang }
1152db78de30SJunchao Zhang 
1153d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1154d71ae5a4SJacob Faibussowitsch {
1155db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1156db78de30SJunchao Zhang 
1157db78de30SJunchao Zhang   PetscFunctionBegin;
1158db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11594f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1160db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1165db78de30SJunchao Zhang }
1166db78de30SJunchao Zhang 
1167d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1168d71ae5a4SJacob Faibussowitsch {
1169db78de30SJunchao Zhang   PetscFunctionBegin;
1170db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11714f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1172db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1175db78de30SJunchao Zhang }
1176db78de30SJunchao Zhang 
1177d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1178d71ae5a4SJacob Faibussowitsch {
1179db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1180db78de30SJunchao Zhang 
1181db78de30SJunchao Zhang   PetscFunctionBegin;
1182db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11834f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1184db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1188db78de30SJunchao Zhang }
1189db78de30SJunchao Zhang 
1190d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1191d71ae5a4SJacob Faibussowitsch {
1192db78de30SJunchao Zhang   PetscFunctionBegin;
1193db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11944f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1195db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
11973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1198db78de30SJunchao Zhang }
1199db78de30SJunchao Zhang 
1200c17cf699SJunchao Zhang /* Computes Y += alpha X */
1201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1202d71ae5a4SJacob Faibussowitsch {
1203a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1204c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1205c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1206c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
1207d326c3f1SJunchao Zhang   auto                    &exec = PetscGetKokkosExecutionSpace();
1208a587d139SMark 
1209a587d139SMark   PetscFunctionBegin;
1210c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1211c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12149566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1215db78de30SJunchao Zhang 
1216c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1217a587d139SMark     PetscBool e;
12189566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1219a587d139SMark     if (e) {
12209566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1221c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1222a587d139SMark     }
1223a587d139SMark   }
1224db78de30SJunchao Zhang 
1225c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1226c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1227c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1228c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1229c17cf699SJunchao Zhang   */
1230c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1231c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1232c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1233c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1234c17cf699SJunchao Zhang 
1235c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1236d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12379566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1238c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1239c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1240c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1241c17cf699SJunchao Zhang 
12429371c9d4SSatish Balay     Kokkos::parallel_for(
1243d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
12440e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
12450e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
12460e3ece09SJunchao Zhang           // Only one thread works in a team
1247c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
12480e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
12490e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
12500e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1251c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1252c17cf699SJunchao Zhang               q++;
1253a587d139SMark             } else {
12540e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
12550e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
12560e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
12570e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
12588b8b16f9SJunchao Zhang #else
12590e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
12608b8b16f9SJunchao Zhang #endif
1261a587d139SMark             }
1262c17cf699SJunchao Zhang           }
1263c17cf699SJunchao Zhang         });
1264c17cf699SJunchao Zhang       });
12659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
12660e3ece09SJunchao Zhang   } else { // different nonzero patterns
1267c17cf699SJunchao Zhang     Mat             Z;
1268c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1269c17cf699SJunchao Zhang     KernelHandle    kh;
12700e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1271c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1272c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1273c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
12749566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
12759566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1276c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1277c17cf699SJunchao Zhang   }
12789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12790e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
12803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1281a587d139SMark }
1282a587d139SMark 
12832c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
12842c4ab24aSJunchao Zhang   PetscCount           n;
12852c4ab24aSJunchao Zhang   PetscCount           Atot;
12862c4ab24aSJunchao Zhang   PetscInt             nz;
12872c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
12882c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
12892c4ab24aSJunchao Zhang 
12902c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
12912c4ab24aSJunchao Zhang   {
12922c4ab24aSJunchao Zhang     nz   = coo_h->nz;
12932c4ab24aSJunchao Zhang     n    = coo_h->n;
12942c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
12952c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
12962c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
12972c4ab24aSJunchao Zhang   }
12982c4ab24aSJunchao Zhang };
12992c4ab24aSJunchao Zhang 
1300*49abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
13012c4ab24aSJunchao Zhang {
13022c4ab24aSJunchao Zhang   PetscFunctionBegin;
1303*49abdd8aSBarry Smith   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
13042c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13052c4ab24aSJunchao Zhang }
13062c4ab24aSJunchao Zhang 
1307d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1308d71ae5a4SJacob Faibussowitsch {
130942550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
131042550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
131103e76207SPierre Jolivet   PetscContainer             container_h;
13122c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13132c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
131442550becSJunchao Zhang 
131542550becSJunchao Zhang   PetscFunctionBegin;
13169566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1317394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
131842550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1319cbc6b225SStefano Zampini   delete akok;
1320f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13219566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13222c4ab24aSJunchao Zhang 
13232c4ab24aSJunchao Zhang   // Copy the COO struct to device
13242c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13252c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13262c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13272c4ab24aSJunchao Zhang 
13282c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
132903e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
13303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
133142550becSJunchao Zhang }
133242550becSJunchao Zhang 
1333d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1334d71ae5a4SJacob Faibussowitsch {
133542550becSJunchao Zhang   MatScalarKokkosView        Aa;
133642550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
133742550becSJunchao Zhang   PetscMemType               memtype;
13382c4ab24aSJunchao Zhang   PetscContainer             container;
13392c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
134042550becSJunchao Zhang 
134142550becSJunchao Zhang   PetscFunctionBegin;
13422c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13432c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
13442c4ab24aSJunchao Zhang 
13452c4ab24aSJunchao Zhang   const auto &n    = coo->n;
13462c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
13472c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
13482c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
13492c4ab24aSJunchao Zhang 
13509566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
135142550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
13522c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
135342550becSJunchao Zhang   } else {
13542c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
135542550becSJunchao Zhang   }
135642550becSJunchao Zhang 
1357c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1358c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
135942550becSJunchao Zhang 
136008bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
13619371c9d4SSatish Balay   Kokkos::parallel_for(
1362d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1363c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1364c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1365c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1366c7b718f4SJunchao Zhang     });
136708bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1368394ed5ebSJunchao Zhang 
13699566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
13709566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
13713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
137242550becSJunchao Zhang }
137342550becSJunchao Zhang 
1374d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1375d71ae5a4SJacob Faibussowitsch {
13768f7e8f9dSMark Adams   PetscFunctionBegin;
13779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
13789566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
13798f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
13803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13818f7e8f9dSMark Adams }
13828f7e8f9dSMark Adams 
1383d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1384d71ae5a4SJacob Faibussowitsch {
1385076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1386076ba34aSJunchao Zhang 
13878c3ff71bSJunchao Zhang   PetscFunctionBegin;
1388076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
13896f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
13906f3d89d0SStefano Zampini 
13918c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
13928c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
13938c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1394a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1395f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1396a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1397076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
13988c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
13998c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14008c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14018c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14028c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14038c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1404076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14050ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1406152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1407f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1408f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1409f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1410f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1411076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1412076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1413076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1414076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1415076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1416076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14177ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
141842550becSJunchao Zhang 
14199566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14209566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
14213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1422076ba34aSJunchao Zhang }
1423076ba34aSJunchao Zhang 
14249d13fa56SJunchao Zhang /*
14259d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14269d13fa56SJunchao Zhang 
14279d13fa56SJunchao Zhang   Input Parameters:
14289d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14299d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14309d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14319d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14329d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14339d13fa56SJunchao Zhang 
14349d13fa56SJunchao Zhang   Output Parameter:
14359d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14369d13fa56SJunchao Zhang */
14379d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14389d13fa56SJunchao Zhang {
14399d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14409d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14419d13fa56SJunchao Zhang 
14429d13fa56SJunchao Zhang   PetscFunctionBegin;
14439d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14449d13fa56SJunchao Zhang 
14459d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14469d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14479d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14489d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
14499d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
14509d13fa56SJunchao Zhang   // TODO: how to tune the team size?
14519d13fa56SJunchao Zhang #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
14529d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
14539d13fa56SJunchao Zhang #else
14549d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
14559d13fa56SJunchao Zhang #endif
14569d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1457d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
14589d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
14599d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
14609d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
14619d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
14629d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
14639d13fa56SJunchao Zhang 
14649d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
14659d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
14669d13fa56SJunchao Zhang 
14679d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
14689d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
14699d13fa56SJunchao Zhang 
14709d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
14719d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
14729d13fa56SJunchao Zhang               B(r, c) = 0.0;
14739d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
14749d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
14759d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
14769d13fa56SJunchao Zhang               B(r, c) = 0.0;
14779d13fa56SJunchao Zhang             }
14789d13fa56SJunchao Zhang           }
14799d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
14809d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
14819d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
14829d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
14839d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
14849d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
14859d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
14869d13fa56SJunchao Zhang           }
14879d13fa56SJunchao Zhang         }
14889d13fa56SJunchao Zhang       });
14899d13fa56SJunchao Zhang 
14909d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
14919d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
14929d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
14939d13fa56SJunchao Zhang     }));
14949d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
14959d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
14969d13fa56SJunchao Zhang }
14979d13fa56SJunchao Zhang 
1498d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1499d71ae5a4SJacob Faibussowitsch {
1500076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1501076ba34aSJunchao Zhang   PetscInt    i, m, n;
1502e36ced11SJunchao Zhang   auto       &exec = PetscGetKokkosExecutionSpace();
1503076ba34aSJunchao Zhang 
1504076ba34aSJunchao Zhang   PetscFunctionBegin;
15055f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1506076ba34aSJunchao Zhang 
1507076ba34aSJunchao Zhang   m = akok->nrows();
1508076ba34aSJunchao Zhang   n = akok->ncols();
15099566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15109566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1511076ba34aSJunchao Zhang 
1512076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
151457508eceSPierre Jolivet   aseq = (Mat_SeqAIJ *)A->data;
1515076ba34aSJunchao Zhang 
1516e36ced11SJunchao Zhang   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1517e36ced11SJunchao Zhang   PetscCallCXX(akok->j_dual.sync_host(exec));
1518e36ced11SJunchao Zhang   PetscCallCXX(exec.fence());
1519076ba34aSJunchao Zhang 
1520076ba34aSJunchao Zhang   aseq->i       = akok->i_host_data();
1521076ba34aSJunchao Zhang   aseq->j       = akok->j_host_data();
1522076ba34aSJunchao Zhang   aseq->a       = akok->a_host_data();
1523076ba34aSJunchao Zhang   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1524076ba34aSJunchao Zhang   aseq->free_a  = PETSC_FALSE;
1525076ba34aSJunchao Zhang   aseq->free_ij = PETSC_FALSE;
1526076ba34aSJunchao Zhang   aseq->nz      = akok->nnz();
1527076ba34aSJunchao Zhang   aseq->maxnz   = aseq->nz;
1528076ba34aSJunchao Zhang 
15299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1531ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1532076ba34aSJunchao Zhang 
1533076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1534076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1535ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15369566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15379566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1539076ba34aSJunchao Zhang }
1540076ba34aSJunchao Zhang 
15410e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15420e3ece09SJunchao Zhang {
15430e3ece09SJunchao Zhang   PetscFunctionBegin;
15440e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15450e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15460e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15470e3ece09SJunchao Zhang }
15480e3ece09SJunchao Zhang 
15490e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
15500e3ece09SJunchao Zhang {
15510e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
15524d86920dSPierre Jolivet 
15530e3ece09SJunchao Zhang   PetscFunctionBegin;
15540e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
15550e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
15560e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15570e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15580e3ece09SJunchao Zhang }
15590e3ece09SJunchao Zhang 
1560076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1561076ba34aSJunchao Zhang 
1562076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1563076ba34aSJunchao Zhang  */
1564d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1565d71ae5a4SJacob Faibussowitsch {
1566076ba34aSJunchao Zhang   PetscFunctionBegin;
15679566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
15689566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
15693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
15708c3ff71bSJunchao Zhang }
15718c3ff71bSJunchao Zhang 
1572152b3e56SJunchao Zhang /*@C
157311a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
15748c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
157520f4b53cSBarry Smith   Kokkos for calculations.
15768c3ff71bSJunchao Zhang 
15778c3ff71bSJunchao Zhang   Collective
15788c3ff71bSJunchao Zhang 
15798c3ff71bSJunchao Zhang   Input Parameters:
158011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
15818c3ff71bSJunchao Zhang . m    - number of rows
15828c3ff71bSJunchao Zhang . n    - number of columns
158320f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
158420f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
15858c3ff71bSJunchao Zhang 
15868c3ff71bSJunchao Zhang   Output Parameter:
15878c3ff71bSJunchao Zhang . A - the matrix
15888c3ff71bSJunchao Zhang 
15892ef1f0ffSBarry Smith   Level: intermediate
15902ef1f0ffSBarry Smith 
15912ef1f0ffSBarry Smith   Notes:
159211a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
15938c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
159411a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
15958c3ff71bSJunchao Zhang 
159611a5261eSBarry Smith   The AIJ format, also called
15972ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
15988c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
159920f4b53cSBarry Smith   either one (as in Fortran) or zero.
16008c3ff71bSJunchao Zhang 
16012ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16022ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16032ef1f0ffSBarry Smith   allocation.
16048c3ff71bSJunchao Zhang 
1605fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16068c3ff71bSJunchao Zhang @*/
1607d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1608d71ae5a4SJacob Faibussowitsch {
16098c3ff71bSJunchao Zhang   PetscFunctionBegin;
16109566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16119566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16129566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16139566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16168c3ff71bSJunchao Zhang }
1617930e68a5SMark Adams 
1618d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1619d71ae5a4SJacob Faibussowitsch {
1620930e68a5SMark Adams   PetscFunctionBegin;
16219566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
162286a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
16233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
162486a27549SJunchao Zhang }
162586a27549SJunchao Zhang 
1626d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1627d71ae5a4SJacob Faibussowitsch {
162886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
162986a27549SJunchao Zhang 
163086a27549SJunchao Zhang   PetscFunctionBegin;
163186a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
163286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
163386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
163486a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
163586a27549SJunchao Zhang   }
16363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
163786a27549SJunchao Zhang }
163886a27549SJunchao Zhang 
163986a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1640d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1641d71ae5a4SJacob Faibussowitsch {
164286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1643076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
164486a27549SJunchao Zhang 
164586a27549SJunchao Zhang   PetscFunctionBegin;
164686a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
164786a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
16487b8d4ba6SJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
16497b8d4ba6SJunchao Zhang     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
16507b8d4ba6SJunchao Zhang     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
165186a27549SJunchao Zhang 
16529371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
165386a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
165486a27549SJunchao Zhang 
165586a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
165686a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
165786a27549SJunchao Zhang     */
16589371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
165986a27549SJunchao Zhang 
166086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
166186a27549SJunchao Zhang 
166286a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
16637b8d4ba6SJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
16647b8d4ba6SJunchao Zhang     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
16657b8d4ba6SJunchao Zhang     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
166686a27549SJunchao Zhang 
16679371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
166886a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
166986a27549SJunchao Zhang 
167086a27549SJunchao Zhang     /* Sort indices. See comments above */
16719371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
167286a27549SJunchao Zhang 
167386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
167486a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
167586a27549SJunchao Zhang   }
16763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
167786a27549SJunchao Zhang }
167886a27549SJunchao Zhang 
167986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1680d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1681d71ae5a4SJacob Faibussowitsch {
168286a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
168386a27549SJunchao Zhang   PetscScalarKokkosView       xv;
168486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
168586a27549SJunchao Zhang 
168686a27549SJunchao Zhang   PetscFunctionBegin;
16879566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
16899566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
16909566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
169186a27549SJunchao Zhang   /* Solve L tmpv = b */
16929566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
169386a27549SJunchao Zhang   /* Solve Ux = tmpv */
16949566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
16959566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
16969566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
16979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
169986a27549SJunchao Zhang }
170086a27549SJunchao Zhang 
1701076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
1702d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1703d71ae5a4SJacob Faibussowitsch {
170486a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
170586a27549SJunchao Zhang   PetscScalarKokkosView       xv;
170686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
170786a27549SJunchao Zhang 
170886a27549SJunchao Zhang   PetscFunctionBegin;
17099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17109566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
17119566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
17129566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
171386a27549SJunchao Zhang   /* Solve U^T tmpv = b */
171486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
171586a27549SJunchao Zhang 
171686a27549SJunchao Zhang   /* Solve L^T x = tmpv */
171786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
17189566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
17199566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
17209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
172286a27549SJunchao Zhang }
172386a27549SJunchao Zhang 
1724d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1725d71ae5a4SJacob Faibussowitsch {
172686a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
172786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
172886a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
172986a27549SJunchao Zhang 
173086a27549SJunchao Zhang   PetscFunctionBegin;
17319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
17329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1733076ba34aSJunchao Zhang 
1734076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1735076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1736076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1737076ba34aSJunchao Zhang 
1738076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
173986a27549SJunchao Zhang 
174086a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
174186a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
174286a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
174386a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
174486a27549SJunchao Zhang   B->ops->matsolve          = NULL;
174586a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
174686a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
174786a27549SJunchao Zhang 
174886a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
174986a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
175086a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1751eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
17529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
175486a27549SJunchao Zhang }
175586a27549SJunchao Zhang 
1756d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1757d71ae5a4SJacob Faibussowitsch {
175886a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
175986a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
176086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
176186a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
176286a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
176386a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
176486a27549SJunchao Zhang 
176586a27549SJunchao Zhang   PetscFunctionBegin;
17669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
176786a27549SJunchao Zhang   /* Rebuild factors */
17689371c9d4SSatish Balay   if (factors) {
17699371c9d4SSatish Balay     factors->Destroy();
17709371c9d4SSatish Balay   } /* Destroy the old if it exists */
17719371c9d4SSatish Balay   else {
17729371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
17739371c9d4SSatish Balay   }
177486a27549SJunchao Zhang 
177586a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
177686a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
177786a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
177886a27549SJunchao Zhang 
177986a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
178086a27549SJunchao Zhang 
178186a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
178286a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
178386a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
178486a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
178586a27549SJunchao Zhang 
178686a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1787076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1788076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1789076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
179086a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
179186a27549SJunchao Zhang 
179286a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
179386a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
179486a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
179586a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
179686a27549SJunchao Zhang 
179786a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
179886a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
179986a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
180086a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
180186a27549SJunchao Zhang #else
180286a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
180386a27549SJunchao Zhang #endif
180486a27549SJunchao Zhang 
180586a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
180686a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
180786a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
180886a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
180986a27549SJunchao Zhang 
181086a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
18119566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
181286a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
181386a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
181486a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1815a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
181686a27549SJunchao Zhang 
181786a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
181886a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
18193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1820930e68a5SMark Adams }
1821930e68a5SMark Adams 
1822d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1823d71ae5a4SJacob Faibussowitsch {
1824930e68a5SMark Adams   PetscFunctionBegin;
1825930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
18263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1827930e68a5SMark Adams }
1828930e68a5SMark Adams 
1829930e68a5SMark Adams /*MC
183086a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
183111a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1832930e68a5SMark Adams 
1833930e68a5SMark Adams   Level: beginner
1834930e68a5SMark Adams 
18351cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1836930e68a5SMark Adams M*/
183786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1838930e68a5SMark Adams {
1839930e68a5SMark Adams   PetscInt n = A->rmap->n;
1840930e68a5SMark Adams 
1841930e68a5SMark Adams   PetscFunctionBegin;
18429566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
18439566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1844930e68a5SMark Adams   (*B)->factortype = ftype;
18459566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
18469566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1847930e68a5SMark Adams 
18488f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18499566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
185086a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
185186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
185286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
18539566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
185486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
185586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
185698921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1857930e68a5SMark Adams 
18589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1859f4f49eeaSPierre Jolivet   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
18603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1861930e68a5SMark Adams }
18628f7e8f9dSMark Adams 
1863d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1864d71ae5a4SJacob Faibussowitsch {
186586a27549SJunchao Zhang   PetscFunctionBegin;
18669566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
18679566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
18683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
186986a27549SJunchao Zhang }
187086a27549SJunchao Zhang 
1871076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1872d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1873d71ae5a4SJacob Faibussowitsch {
1874076ba34aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1875076ba34aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1876076ba34aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1877076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1878076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1879076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1880076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1881076ba34aSJunchao Zhang 
1882076ba34aSJunchao Zhang   PetscFunctionBegin;
18839566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1884076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
18859566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
188648a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
18879566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1888076ba34aSJunchao Zhang   }
18893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1890076ba34aSJunchao Zhang }
1891