xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 2695cf96bf725a23d8edb31ff8aa2ca9ba36b655)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3c0c276a7Ssdargavi #include <petscmat_kokkos.hpp>
4076ba34aSJunchao Zhang #include <petscpkg_version.h>
5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
642550becSJunchao Zhang #include <petsc/private/sfimpl.h>
7aac854edSJunchao Zhang #include <petsc/private/kokkosimpl.hpp>
87233ce55SJed Brown #include <petscsys.h>
98c3ff71bSJunchao Zhang 
108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
11f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
13cc6e31f1SJunchao Zhang 
14cc6e31f1SJunchao Zhang // To suppress compiler warnings:
15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19*2695cf96SNuno Nobre #define DISABLE_CUSPARSE_DEPRECATED
208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
21cc6e31f1SJunchao Zhang 
2286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
2386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
24076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
25076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
269d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
279d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
2886a27549SJunchao Zhang 
2942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
308c3ff71bSJunchao Zhang 
310e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
33f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
349371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
35f98996d3SJunchao Zhang #else
36f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
37f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
389371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
39f98996d3SJunchao Zhang #endif
40f98996d3SJunchao Zhang 
41aac854edSJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42aac854edSJunchao Zhang using KokkosSparse::spiluk_symbolic;
43aac854edSJunchao Zhang using KokkosSparse::spiluk_numeric;
44aac854edSJunchao Zhang using KokkosSparse::sptrsv_symbolic;
45aac854edSJunchao Zhang using KokkosSparse::sptrsv_solve;
46aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm;
47aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm;
48aac854edSJunchao Zhang #else
49aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_symbolic;
50aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_numeric;
51aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_symbolic;
52aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_solve;
53aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm;
54aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm;
55aac854edSJunchao Zhang #endif
56aac854edSJunchao Zhang 
578c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
588c3ff71bSJunchao Zhang 
59076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
62076ba34aSJunchao Zhang  */
63d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64d71ae5a4SJacob Faibussowitsch {
65076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
66076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
678c3ff71bSJunchao Zhang 
688c3ff71bSJunchao Zhang   PetscFunctionBegin;
693ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
709566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71076ba34aSJunchao Zhang 
72076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74076ba34aSJunchao Zhang 
75076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
76076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
78076ba34aSJunchao Zhang   */
79076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
8093a54799SPierre Jolivet     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87d1c799ffSJunchao Zhang       aijseq->free_a  = PETSC_TRUE;
88d1c799ffSJunchao Zhang       aijseq->free_ij = PETSC_TRUE;
89d1c799ffSJunchao Zhang       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90d1c799ffSJunchao Zhang          we have the CsrMatrix on device first and then copy to host, followed by
91d1c799ffSJunchao Zhang          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92d1c799ffSJunchao Zhang          One could improve it by not using NULL garray.
93d1c799ffSJunchao Zhang       */
94d1c799ffSJunchao Zhang     }
95076ba34aSJunchao Zhang     delete aijkok;
96f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
97076ba34aSJunchao Zhang     A->spptr = aijkok;
98f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
100f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
101f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
102076ba34aSJunchao Zhang   }
1033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1048c3ff71bSJunchao Zhang }
1058c3ff71bSJunchao Zhang 
10686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
107d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
108d71ae5a4SJacob Faibussowitsch {
1098c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1108c3ff71bSJunchao Zhang 
1118c3ff71bSJunchao Zhang   PetscFunctionBegin;
112aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
1135f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
114076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
115f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
116580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
11786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
1188c3ff71bSJunchao Zhang   }
1193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1208c3ff71bSJunchao Zhang }
1218c3ff71bSJunchao Zhang 
122076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
123d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
124d71ae5a4SJacob Faibussowitsch {
12586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
12686a27549SJunchao Zhang 
12786a27549SJunchao Zhang   PetscFunctionBegin;
1285f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
12986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
13086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
13186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
13286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
1339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
1349566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
1353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13686a27549SJunchao Zhang }
13786a27549SJunchao Zhang 
138d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
139d71ae5a4SJacob Faibussowitsch {
140f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1414df4a32cSJunchao Zhang   auto              exec   = PetscGetKokkosExecutionSpace();
142f0cf5187SStefano Zampini 
143f0cf5187SStefano Zampini   PetscFunctionBegin;
144f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
14586a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
146aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1475f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
148f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
1493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
150f0cf5187SStefano Zampini }
151f0cf5187SStefano Zampini 
152d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
153d71ae5a4SJacob Faibussowitsch {
154076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
155f0cf5187SStefano Zampini 
156f0cf5187SStefano Zampini   PetscFunctionBegin;
1575519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1585519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1595519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1605519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1615519a089SJose E. Roman   */
1625519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
164076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
165076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
166076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
167076ba34aSJunchao Zhang   }
1683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
169076ba34aSJunchao Zhang }
170076ba34aSJunchao Zhang 
171d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
172d71ae5a4SJacob Faibussowitsch {
173076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
174076ba34aSJunchao Zhang 
175076ba34aSJunchao Zhang   PetscFunctionBegin;
1765519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
178076ba34aSJunchao Zhang }
179076ba34aSJunchao Zhang 
180d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
181d71ae5a4SJacob Faibussowitsch {
182076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
183076ba34aSJunchao Zhang 
184076ba34aSJunchao Zhang   PetscFunctionBegin;
1855519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
186f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
187076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1882328674fSJunchao Zhang   } else {
1892328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1902328674fSJunchao Zhang   }
1913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
192076ba34aSJunchao Zhang }
193076ba34aSJunchao Zhang 
194d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
195d71ae5a4SJacob Faibussowitsch {
196076ba34aSJunchao Zhang   PetscFunctionBegin;
197076ba34aSJunchao Zhang   *array = NULL;
1983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
199076ba34aSJunchao Zhang }
200076ba34aSJunchao Zhang 
201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
202d71ae5a4SJacob Faibussowitsch {
203076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
204076ba34aSJunchao Zhang 
205076ba34aSJunchao Zhang   PetscFunctionBegin;
2065519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
207076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
2082328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
2092328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
2102328674fSJunchao Zhang   }
2113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
212076ba34aSJunchao Zhang }
213076ba34aSJunchao Zhang 
214d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
215d71ae5a4SJacob Faibussowitsch {
216076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217076ba34aSJunchao Zhang 
218076ba34aSJunchao Zhang   PetscFunctionBegin;
2195519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
220076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
221076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
2222328674fSJunchao Zhang   }
2233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
224f0cf5187SStefano Zampini }
225f0cf5187SStefano Zampini 
226d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
227d71ae5a4SJacob Faibussowitsch {
2287ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2297ee59b9bSJunchao Zhang 
2307ee59b9bSJunchao Zhang   PetscFunctionBegin;
2317ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
2327ee59b9bSJunchao Zhang 
2337ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
2347ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2357ee59b9bSJunchao Zhang   if (a) {
236f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
2377ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2387ee59b9bSJunchao Zhang   }
2397ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2417ee59b9bSJunchao Zhang }
2427ee59b9bSJunchao Zhang 
24303db1824SAlex Lindsay static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
24403db1824SAlex Lindsay {
24503db1824SAlex Lindsay   PetscFunctionBegin;
24603db1824SAlex Lindsay   *mtype = PETSC_MEMTYPE_KOKKOS;
24703db1824SAlex Lindsay   PetscFunctionReturn(PETSC_SUCCESS);
24803db1824SAlex Lindsay }
24903db1824SAlex Lindsay 
2500e3ece09SJunchao Zhang /*
2510e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2520e3ece09SJunchao Zhang 
2530e3ece09SJunchao Zhang   Input Parameter:
2540e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2550e3ece09SJunchao Zhang 
2560e3ece09SJunchao Zhang   Output Parameters:
2570e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
258aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2590e3ece09SJunchao Zhang */
2600e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
261d71ae5a4SJacob Faibussowitsch {
2620e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2630e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2640e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2657b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2660e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2677b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2687b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2690e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2700e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2710e3ece09SJunchao Zhang   PetscInt               *offset;
272152b3e56SJunchao Zhang 
273152b3e56SJunchao Zhang   PetscFunctionBegin;
2740e3ece09SJunchao Zhang   // Populate Ti
2750e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2760e3ece09SJunchao Zhang   Ti++;
2770e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2780e3ece09SJunchao Zhang   Ti--;
2790e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2800e3ece09SJunchao Zhang 
2810e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2820e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2830e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2840e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2850e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2860e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2870e3ece09SJunchao Zhang 
2880e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2890e3ece09SJunchao Zhang       perm[disp] = j;
2900e3ece09SJunchao Zhang       offset[r]++;
291076ba34aSJunchao Zhang     }
2920e3ece09SJunchao Zhang   }
2930e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2940e3ece09SJunchao Zhang 
2950e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2960e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2970e3ece09SJunchao Zhang 
2980e3ece09SJunchao Zhang   // Output perm and T on device
2990e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
3000e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
3010e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
3020e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
3033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
304152b3e56SJunchao Zhang }
305152b3e56SJunchao Zhang 
3060e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
3070e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
3080e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
309d71ae5a4SJacob Faibussowitsch {
3100e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3110e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3120e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3130e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
314152b3e56SJunchao Zhang 
315152b3e56SJunchao Zhang   PetscFunctionBegin;
3160e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
317f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
3180e3ece09SJunchao Zhang 
3190e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3200e3ece09SJunchao Zhang 
3210e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
3220e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
3230e3ece09SJunchao Zhang   } else {
3240e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
3250e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3260e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
3270e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3280e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3290e3ece09SJunchao Zhang 
330d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
331076ba34aSJunchao Zhang       }
3320e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3330e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3340e3ece09SJunchao Zhang 
3350e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3360e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
337d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
3380e3ece09SJunchao Zhang     }
3390e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
3400e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
3410e3ece09SJunchao Zhang   }
3420e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3430e3ece09SJunchao Zhang }
3440e3ece09SJunchao Zhang 
3450e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3460e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3470e3ece09SJunchao Zhang {
3480e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3490e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3500e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3510e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3520e3ece09SJunchao Zhang 
3530e3ece09SJunchao Zhang   PetscFunctionBegin;
3540e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
355f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
3560e3ece09SJunchao Zhang 
3570e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3580e3ece09SJunchao Zhang 
3590e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3600e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3610e3ece09SJunchao Zhang   } else {
3620e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3630e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3640e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3650e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3660e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3670e3ece09SJunchao Zhang 
368d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3690e3ece09SJunchao Zhang       }
3700e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3710e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3720e3ece09SJunchao Zhang 
3730e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3740e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
375d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3760e3ece09SJunchao Zhang     }
3770e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3780e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3790e3ece09SJunchao Zhang   }
3803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
381152b3e56SJunchao Zhang }
382a587d139SMark 
3838c3ff71bSJunchao Zhang /* y = A x */
384d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
385d71ae5a4SJacob Faibussowitsch {
3868c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
387152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
388152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3898c3ff71bSJunchao Zhang 
3908c3ff71bSJunchao Zhang   PetscFunctionBegin;
3919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3939566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3949566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3958c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3979566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3989566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
399076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
4009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4038c3ff71bSJunchao Zhang }
4048c3ff71bSJunchao Zhang 
4058c3ff71bSJunchao Zhang /* y = A^T x */
406d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
407d71ae5a4SJacob Faibussowitsch {
4088c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
409152b3e56SJunchao Zhang   const char                *mode;
410152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
411152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4120e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4138c3ff71bSJunchao Zhang 
4148c3ff71bSJunchao Zhang   PetscFunctionBegin;
4159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4169566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4179566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4189566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
419152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4209566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
421152b3e56SJunchao Zhang     mode = "N";
422152b3e56SJunchao Zhang   } else {
423076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4240e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
425152b3e56SJunchao Zhang     mode   = "T";
426152b3e56SJunchao Zhang   }
427d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
4289566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4299566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4300e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4338c3ff71bSJunchao Zhang }
4348c3ff71bSJunchao Zhang 
4358c3ff71bSJunchao Zhang /* y = A^H x */
436d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
437d71ae5a4SJacob Faibussowitsch {
4388c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
439152b3e56SJunchao Zhang   const char                *mode;
440152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
441152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4420e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4438c3ff71bSJunchao Zhang 
4448c3ff71bSJunchao Zhang   PetscFunctionBegin;
4459566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4469566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4479566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4489566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
449152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4509566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
451152b3e56SJunchao Zhang     mode = "N";
452152b3e56SJunchao Zhang   } else {
453076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4540e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
455152b3e56SJunchao Zhang     mode   = "C";
456152b3e56SJunchao Zhang   }
457d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4589566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4599566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4600e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4638c3ff71bSJunchao Zhang }
4648c3ff71bSJunchao Zhang 
4658c3ff71bSJunchao Zhang /* z = A x + y */
466d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
467d71ae5a4SJacob Faibussowitsch {
4688c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
46992896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
470152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4718c3ff71bSJunchao Zhang 
4728c3ff71bSJunchao Zhang   PetscFunctionBegin;
4739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
47592896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
4769566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
47792896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
4788c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
479d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4809566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
48192896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4858c3ff71bSJunchao Zhang }
4868c3ff71bSJunchao Zhang 
4878c3ff71bSJunchao Zhang /* z = A^T x + y */
488d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
489d71ae5a4SJacob Faibussowitsch {
4908c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
491152b3e56SJunchao Zhang   const char                *mode;
49292896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
493152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4940e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4958c3ff71bSJunchao Zhang 
4968c3ff71bSJunchao Zhang   PetscFunctionBegin;
4979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
49992896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
5009566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
50192896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
502152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
5039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
504152b3e56SJunchao Zhang     mode = "N";
505152b3e56SJunchao Zhang   } else {
506076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5070e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
508152b3e56SJunchao Zhang     mode   = "T";
509152b3e56SJunchao Zhang   }
510d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
5119566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
51292896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5130e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5149566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5168c3ff71bSJunchao Zhang }
5178c3ff71bSJunchao Zhang 
5188c3ff71bSJunchao Zhang /* z = A^H x + y */
519d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
520d71ae5a4SJacob Faibussowitsch {
5218c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
522152b3e56SJunchao Zhang   const char                *mode;
52392896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
524152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
5250e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
5268c3ff71bSJunchao Zhang 
5278c3ff71bSJunchao Zhang   PetscFunctionBegin;
5289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
5299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
53092896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
5319566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
53292896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
533152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
5349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
535152b3e56SJunchao Zhang     mode = "N";
536152b3e56SJunchao Zhang   } else {
537076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5380e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
539152b3e56SJunchao Zhang     mode   = "C";
540152b3e56SJunchao Zhang   }
541d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5429566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
54392896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5440e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5459566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
547152b3e56SJunchao Zhang }
548152b3e56SJunchao Zhang 
54966976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
550d71ae5a4SJacob Faibussowitsch {
551152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
552152b3e56SJunchao Zhang 
553152b3e56SJunchao Zhang   PetscFunctionBegin;
554152b3e56SJunchao Zhang   switch (op) {
555152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
556152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5579566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
558152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
559152b3e56SJunchao Zhang     break;
560d71ae5a4SJacob Faibussowitsch   default:
561d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
562d71ae5a4SJacob Faibussowitsch     break;
563152b3e56SJunchao Zhang   }
5643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5658c3ff71bSJunchao Zhang }
5668c3ff71bSJunchao Zhang 
567076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
568d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
569d71ae5a4SJacob Faibussowitsch {
570076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5718c3ff71bSJunchao Zhang 
5728c3ff71bSJunchao Zhang   PetscFunctionBegin;
5739566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
574076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
57551ece73cSJunchao Zhang     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
57651ece73cSJunchao Zhang     PetscCall(MatSetType(*newmat, mtype));
5778c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5789566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
579076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5805f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5819566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5829566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5839566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5849566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
585076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
586394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5875f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
588f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5898c3ff71bSJunchao Zhang     }
590076ba34aSJunchao Zhang   }
5913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5928c3ff71bSJunchao Zhang }
5938c3ff71bSJunchao Zhang 
594076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
595076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
596076ba34aSJunchao Zhang  */
597d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
598d71ae5a4SJacob Faibussowitsch {
599076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
600076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
601076ba34aSJunchao Zhang   Mat               mat;
6028c3ff71bSJunchao Zhang 
6038c3ff71bSJunchao Zhang   PetscFunctionBegin;
604076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
6059566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
606076ba34aSJunchao Zhang   mat = *B;
607f4747e26SJunchao Zhang   if (A->assembled) {
608076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
609f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
610076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
611076ba34aSJunchao Zhang     /* Now copy values to B if needed */
612076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
613076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
614076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
615076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
616076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
617076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
618076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
619076ba34aSJunchao Zhang       }
620076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
621076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
622076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
623076ba34aSJunchao Zhang     }
624076ba34aSJunchao Zhang     mat->spptr = bkok;
625076ba34aSJunchao Zhang   }
626076ba34aSJunchao Zhang 
6279566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
6289566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
6299566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
6309566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
6313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6328c3ff71bSJunchao Zhang }
6338c3ff71bSJunchao Zhang 
634d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
635d71ae5a4SJacob Faibussowitsch {
6360ecb592aSJunchao Zhang   Mat               At;
6370e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
6380ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
6390ecb592aSJunchao Zhang 
6400ecb592aSJunchao Zhang   PetscFunctionBegin;
6417fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
6429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6430ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
644ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6450e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6469566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6470ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6489566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6490ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6500ecb592aSJunchao Zhang     if ((*B)->assembled) {
6510ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6520e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6539566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6540ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6550ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6560e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6570e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6580e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6590e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6600ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6610ecb592aSJunchao Zhang   }
6623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6630ecb592aSJunchao Zhang }
6640ecb592aSJunchao Zhang 
665d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
666d71ae5a4SJacob Faibussowitsch {
66786a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6688c3ff71bSJunchao Zhang 
6698c3ff71bSJunchao Zhang   PetscFunctionBegin;
67086a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
67186a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6728c3ff71bSJunchao Zhang     delete aijkok;
67386a27549SJunchao Zhang   } else {
67486a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
67586a27549SJunchao Zhang   }
676cbc6b225SStefano Zampini   A->spptr = NULL;
6779566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6789566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
68057761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
68157761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
68257761e9aSJunchao Zhang #endif
6839566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6858c3ff71bSJunchao Zhang }
6868c3ff71bSJunchao Zhang 
6873f3ba80aSJunchao Zhang /*MC
6883f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6893f3ba80aSJunchao Zhang 
69015229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6913f3ba80aSJunchao Zhang 
6922ef1f0ffSBarry Smith    Options Database Key:
69311a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6943f3ba80aSJunchao Zhang 
6953f3ba80aSJunchao Zhang   Level: beginner
6963f3ba80aSJunchao Zhang 
6971cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6983f3ba80aSJunchao Zhang M*/
699d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
700d71ae5a4SJacob Faibussowitsch {
70186a27549SJunchao Zhang   PetscFunctionBegin;
7029566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
7039566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
7049566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
7053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
70686a27549SJunchao Zhang }
70786a27549SJunchao Zhang 
708076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
709d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
710d71ae5a4SJacob Faibussowitsch {
711076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
712076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
713076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
714076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
715076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
716076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
717a3f881fbSStefano Zampini 
718a3f881fbSStefano Zampini   PetscFunctionBegin;
719076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
720076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
7214f572ea9SToby Isaac   PetscAssertPointer(C, 4);
722076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
723076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
7245f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
7255f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
726076ba34aSJunchao Zhang 
7279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
729076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
730076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
731076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
732076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
733076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
734076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
735076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
736076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
737076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
738076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
739076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
740076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
741076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
742076ba34aSJunchao Zhang     aj      = akok->j_dual.view_device();
743076ba34aSJunchao Zhang     bj      = bkok->j_dual.view_device();
744ecd797f4SJunchao Zhang     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
745ecd797f4SJunchao Zhang     auto ci = MatRowMapKokkosView("i", ai.extent(0));
746ecd797f4SJunchao Zhang     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
747076ba34aSJunchao Zhang 
748076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7499371c9d4SSatish Balay     Kokkos::parallel_for(
750d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
751076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
752076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
753076ba34aSJunchao Zhang 
754076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
755076ba34aSJunchao Zhang                                                    ci(i) = coffset;
756076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
757076ba34aSJunchao Zhang         });
758076ba34aSJunchao Zhang 
759076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
760076ba34aSJunchao Zhang           if (k < alen) {
761076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
762076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
763076ba34aSJunchao Zhang           } else {
764076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
765076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
766076ba34aSJunchao Zhang           }
767076ba34aSJunchao Zhang         });
768076ba34aSJunchao Zhang       });
769ecd797f4SJunchao Zhang     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
7709566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
772076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
773076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
776076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
777076ba34aSJunchao Zhang 
7789371c9d4SSatish Balay     Kokkos::parallel_for(
779d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
781076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
785076ba34aSJunchao Zhang         });
786076ba34aSJunchao Zhang       });
7879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788076ba34aSJunchao Zhang   }
7893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
790076ba34aSJunchao Zhang }
791076ba34aSJunchao Zhang 
792d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793d71ae5a4SJacob Faibussowitsch {
794076ba34aSJunchao Zhang   PetscFunctionBegin;
795076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
797a3f881fbSStefano Zampini }
798a3f881fbSStefano Zampini 
799d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800d71ae5a4SJacob Faibussowitsch {
801a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
802a3f881fbSStefano Zampini   Mat                          A, B;
803076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
804a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
805a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
806076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
8070e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
808a3f881fbSStefano Zampini 
809a3f881fbSStefano Zampini   PetscFunctionBegin;
810a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8115f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
813076ba34aSJunchao Zhang 
8140e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
8150e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
8160e3ece09SJunchao Zhang   // we still do numeric.
8170e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
8180e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
8193ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
820076ba34aSJunchao Zhang   }
821076ba34aSJunchao Zhang 
822076ba34aSJunchao Zhang   switch (product->type) {
8239371c9d4SSatish Balay   case MATPRODUCT_AB:
8249371c9d4SSatish Balay     transA = false;
8259371c9d4SSatish Balay     transB = false;
8269371c9d4SSatish Balay     break;
8279371c9d4SSatish Balay   case MATPRODUCT_AtB:
8289371c9d4SSatish Balay     transA = true;
8299371c9d4SSatish Balay     transB = false;
8309371c9d4SSatish Balay     break;
8319371c9d4SSatish Balay   case MATPRODUCT_ABt:
8329371c9d4SSatish Balay     transA = false;
8339371c9d4SSatish Balay     transB = true;
8349371c9d4SSatish Balay     break;
835d71ae5a4SJacob Faibussowitsch   default:
836d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837076ba34aSJunchao Zhang   }
838076ba34aSJunchao Zhang 
839a3f881fbSStefano Zampini   A = product->A;
840a3f881fbSStefano Zampini   B = product->B;
8419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
843a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
846076ba34aSJunchao Zhang 
8475f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
848076ba34aSJunchao Zhang 
8490e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8500e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
851076ba34aSJunchao Zhang 
852076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853076ba34aSJunchao Zhang   if (transA) {
8549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855076ba34aSJunchao Zhang     transA = false;
856a3f881fbSStefano Zampini   }
857a3f881fbSStefano Zampini 
858076ba34aSJunchao Zhang   if (transB) {
8599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860076ba34aSJunchao Zhang     transB = false;
861076ba34aSJunchao Zhang   }
8629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8630e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8640e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
866866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867e944a159SJunchao Zhang #endif
868866eb059SJunchao Zhang 
8699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8709566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
871a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
872a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8739566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8749566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8759566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876a3f881fbSStefano Zampini   c->reallocs         = 0;
877076ba34aSJunchao Zhang   C->info.mallocs     = 0;
878a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
879a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
880a3f881fbSStefano Zampini   C->num_ass++;
8813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
882a3f881fbSStefano Zampini }
883a3f881fbSStefano Zampini 
884d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885d71ae5a4SJacob Faibussowitsch {
886076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
887076ba34aSJunchao Zhang   MatProductType               ptype;
888076ba34aSJunchao Zhang   Mat                          A, B;
889076ba34aSJunchao Zhang   bool                         transA, transB;
890076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
891076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
892076ba34aSJunchao Zhang   MPI_Comm                     comm;
8930e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
894a3f881fbSStefano Zampini 
895a3f881fbSStefano Zampini   PetscFunctionBegin;
896a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8979566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8985f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899a3f881fbSStefano Zampini   A = product->A;
900a3f881fbSStefano Zampini   B = product->B;
9019566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
9029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
903a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
9050e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
9060e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
907076ba34aSJunchao Zhang 
908a3f881fbSStefano Zampini   ptype = product->type;
9090e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
9100e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
9110e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
9120e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
9130e3ece09SJunchao Zhang   }
9140e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
9150e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
9160e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
9170e3ece09SJunchao Zhang   }
9180e3ece09SJunchao Zhang 
919a3f881fbSStefano Zampini   switch (ptype) {
9209371c9d4SSatish Balay   case MATPRODUCT_AB:
9219371c9d4SSatish Balay     transA = false;
9229371c9d4SSatish Balay     transB = false;
9230e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
9249371c9d4SSatish Balay     break;
9259371c9d4SSatish Balay   case MATPRODUCT_AtB:
9269371c9d4SSatish Balay     transA = true;
9279371c9d4SSatish Balay     transB = false;
9280e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
9290e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
9309371c9d4SSatish Balay     break;
9319371c9d4SSatish Balay   case MATPRODUCT_ABt:
9329371c9d4SSatish Balay     transA = false;
9339371c9d4SSatish Balay     transB = true;
9340e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
9350e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
9369371c9d4SSatish Balay     break;
937d71ae5a4SJacob Faibussowitsch   default:
938d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939a3f881fbSStefano Zampini   }
9400e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
942a3f881fbSStefano Zampini 
943076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
944866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
945866eb059SJunchao Zhang 
946866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950866eb059SJunchao Zhang   #endif
951866eb059SJunchao Zhang #endif
9520e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
953076ba34aSJunchao Zhang 
9549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
955076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956076ba34aSJunchao Zhang   if (transA) {
9579566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958076ba34aSJunchao Zhang     transA = false;
959076ba34aSJunchao Zhang   }
960076ba34aSJunchao Zhang 
961076ba34aSJunchao Zhang   if (transB) {
9629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963076ba34aSJunchao Zhang     transB = false;
964076ba34aSJunchao Zhang   }
965076ba34aSJunchao Zhang 
9660e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
969076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
970076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
971076ba34aSJunchao Zhang   */
9720e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9730e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
976866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977e944a159SJunchao Zhang #endif
9789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
979076ba34aSJunchao Zhang 
9809566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9819566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
984a3f881fbSStefano Zampini }
985a3f881fbSStefano Zampini 
986a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
987d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988d71ae5a4SJacob Faibussowitsch {
989076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
990a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
991a3f881fbSStefano Zampini 
992a3f881fbSStefano Zampini   PetscFunctionBegin;
993a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9949566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
99548a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
997a3f881fbSStefano Zampini     switch (product->type) {
998a3f881fbSStefano Zampini     case MATPRODUCT_AB:
999a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
1000d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
1001d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002d71ae5a4SJacob Faibussowitsch       break;
1003a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
1004a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
1005d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
1006d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007d71ae5a4SJacob Faibussowitsch       break;
1008d71ae5a4SJacob Faibussowitsch     default:
1009d71ae5a4SJacob Faibussowitsch       break;
1010a3f881fbSStefano Zampini     }
1011a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
10129566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013a3f881fbSStefano Zampini   }
10143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1015a3f881fbSStefano Zampini }
1016a587d139SMark 
1017d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018d71ae5a4SJacob Faibussowitsch {
1019f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
1020f0cf5187SStefano Zampini 
1021f0cf5187SStefano Zampini   PetscFunctionBegin;
10229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
10239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
10269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
10279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
10289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1030f0cf5187SStefano Zampini }
1031f0cf5187SStefano Zampini 
1032f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034f4747e26SJunchao Zhang {
1035f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1036f4747e26SJunchao Zhang 
1037f4747e26SJunchao Zhang   PetscFunctionBegin;
1038f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1040f4747e26SJunchao Zhang 
1041f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1042f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1045f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1046d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1049f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1050f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1051f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1052f4747e26SJunchao Zhang   }
1053f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1054f4747e26SJunchao Zhang }
1055f4747e26SJunchao Zhang 
1056f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057f4747e26SJunchao Zhang {
1058f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1059f4747e26SJunchao Zhang 
1060f4747e26SJunchao Zhang   PetscFunctionBegin;
1061f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1063f4747e26SJunchao Zhang     PetscInt                   n, nv;
1064f4747e26SJunchao Zhang 
1065f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1066f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1068f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1069f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1070f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1071f4747e26SJunchao Zhang 
1072f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1074f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1075f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1076d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1079f4747e26SJunchao Zhang       }));
1080f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1081f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1083f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1084f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1085f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1086f4747e26SJunchao Zhang   }
1087f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1088f4747e26SJunchao Zhang }
1089f4747e26SJunchao Zhang 
1090f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091f4747e26SJunchao Zhang {
1092f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1095f4747e26SJunchao Zhang 
1096f4747e26SJunchao Zhang   PetscFunctionBegin;
1097f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1098f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1101f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1102f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1103f4747e26SJunchao Zhang   if (ll) {
1104f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1105f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1107f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1108d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1110f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1111f4747e26SJunchao Zhang         // scale entries on the row
1112f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113f4747e26SJunchao Zhang       }));
1114f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1115f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1116f4747e26SJunchao Zhang   }
1117f4747e26SJunchao Zhang   if (rr) {
1118f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1119f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1121f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1124f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1125f4747e26SJunchao Zhang   }
1126f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1128f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1129f4747e26SJunchao Zhang }
1130f4747e26SJunchao Zhang 
1131d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132d71ae5a4SJacob Faibussowitsch {
1133076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1134a587d139SMark 
1135a587d139SMark   PetscFunctionBegin;
1136076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
11372328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1138d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
11399566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
11402328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
11419566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
11422328674fSJunchao Zhang   }
11433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1144a587d139SMark }
1145a587d139SMark 
1146d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147d71ae5a4SJacob Faibussowitsch {
1148f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1149f78ce678SMark Adams   PetscInt              n;
1150f78ce678SMark Adams   PetscScalarKokkosView xv;
1151f78ce678SMark Adams 
1152f78ce678SMark Adams   PetscFunctionBegin;
1153f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1154f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1156f78ce678SMark Adams 
1157f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1159f78ce678SMark Adams 
1160f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1161f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1162f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1163f78ce678SMark Adams 
1164f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11659371c9d4SSatish Balay   Kokkos::parallel_for(
1166d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168f78ce678SMark Adams       else xv(i) = 0;
1169f78ce678SMark Adams     });
1170f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1172f78ce678SMark Adams }
1173f78ce678SMark Adams 
1174db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176d71ae5a4SJacob Faibussowitsch {
1177db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1178db78de30SJunchao Zhang 
1179db78de30SJunchao Zhang   PetscFunctionBegin;
1180db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11814f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1182db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1187db78de30SJunchao Zhang }
1188db78de30SJunchao Zhang 
1189d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190d71ae5a4SJacob Faibussowitsch {
1191db78de30SJunchao Zhang   PetscFunctionBegin;
1192db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11934f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1194db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1196db78de30SJunchao Zhang }
1197db78de30SJunchao Zhang 
1198d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199d71ae5a4SJacob Faibussowitsch {
1200db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1201db78de30SJunchao Zhang 
1202db78de30SJunchao Zhang   PetscFunctionBegin;
1203db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12044f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1205db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1210db78de30SJunchao Zhang }
1211db78de30SJunchao Zhang 
1212d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213d71ae5a4SJacob Faibussowitsch {
1214db78de30SJunchao Zhang   PetscFunctionBegin;
1215db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12164f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1217db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1220db78de30SJunchao Zhang }
1221db78de30SJunchao Zhang 
1222d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223d71ae5a4SJacob Faibussowitsch {
1224db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1225db78de30SJunchao Zhang 
1226db78de30SJunchao Zhang   PetscFunctionBegin;
1227db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12284f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1229db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1233db78de30SJunchao Zhang }
1234db78de30SJunchao Zhang 
1235d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236d71ae5a4SJacob Faibussowitsch {
1237db78de30SJunchao Zhang   PetscFunctionBegin;
1238db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12394f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1240db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1243db78de30SJunchao Zhang }
1244db78de30SJunchao Zhang 
1245c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246c0c276a7Ssdargavi {
1247c0c276a7Ssdargavi   Mat_SeqAIJKokkos *akok;
1248c0c276a7Ssdargavi 
1249c0c276a7Ssdargavi   PetscFunctionBegin;
1250ecd797f4SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1251c0c276a7Ssdargavi   PetscCall(MatCreate(comm, A));
1252c0c276a7Ssdargavi   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1253c0c276a7Ssdargavi   PetscFunctionReturn(PETSC_SUCCESS);
1254c0c276a7Ssdargavi }
1255c0c276a7Ssdargavi 
1256c17cf699SJunchao Zhang /* Computes Y += alpha X */
1257d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1258d71ae5a4SJacob Faibussowitsch {
1259a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1260c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1261c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1262c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
12634df4a32cSJunchao Zhang   auto                     exec = PetscGetKokkosExecutionSpace();
1264a587d139SMark 
1265a587d139SMark   PetscFunctionBegin;
1266c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1267c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1271db78de30SJunchao Zhang 
1272c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1273a587d139SMark     PetscBool e;
12749566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1275a587d139SMark     if (e) {
12769566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1277c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1278a587d139SMark     }
1279a587d139SMark   }
1280db78de30SJunchao Zhang 
1281c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1282c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1283c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1284c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1285c17cf699SJunchao Zhang   */
1286c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1287c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1288c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1289c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1290c17cf699SJunchao Zhang 
1291c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1292d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12939566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1294c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1295c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1296c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1297c17cf699SJunchao Zhang 
12989371c9d4SSatish Balay     Kokkos::parallel_for(
1299d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
13000e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
13010e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
13020e3ece09SJunchao Zhang           // Only one thread works in a team
1303c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
13040e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
13050e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
13060e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1307c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1308c17cf699SJunchao Zhang               q++;
1309a587d139SMark             } else {
13100e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
13110e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
13120e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
13130e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
13148b8b16f9SJunchao Zhang #else
13150e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
13168b8b16f9SJunchao Zhang #endif
1317a587d139SMark             }
1318c17cf699SJunchao Zhang           }
1319c17cf699SJunchao Zhang         });
1320c17cf699SJunchao Zhang       });
13219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
13220e3ece09SJunchao Zhang   } else { // different nonzero patterns
1323c17cf699SJunchao Zhang     Mat             Z;
1324c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1325c17cf699SJunchao Zhang     KernelHandle    kh;
13260e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1327c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1328c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1329c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
13309566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
13319566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1332c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1333c17cf699SJunchao Zhang   }
13349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13350e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
13363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1337a587d139SMark }
1338a587d139SMark 
13392c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
13402c4ab24aSJunchao Zhang   PetscCount           n;
13412c4ab24aSJunchao Zhang   PetscCount           Atot;
13422c4ab24aSJunchao Zhang   PetscInt             nz;
13432c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
13442c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
13452c4ab24aSJunchao Zhang 
13462c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
13472c4ab24aSJunchao Zhang   {
13482c4ab24aSJunchao Zhang     nz   = coo_h->nz;
13492c4ab24aSJunchao Zhang     n    = coo_h->n;
13502c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
13512c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
13522c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
13532c4ab24aSJunchao Zhang   }
13542c4ab24aSJunchao Zhang };
13552c4ab24aSJunchao Zhang 
135649abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
13572c4ab24aSJunchao Zhang {
13582c4ab24aSJunchao Zhang   PetscFunctionBegin;
135949abdd8aSBarry Smith   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
13602c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13612c4ab24aSJunchao Zhang }
13622c4ab24aSJunchao Zhang 
1363d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1364d71ae5a4SJacob Faibussowitsch {
136542550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
136642550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
136703e76207SPierre Jolivet   PetscContainer             container_h;
13682c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13692c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
137042550becSJunchao Zhang 
137142550becSJunchao Zhang   PetscFunctionBegin;
13729566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1373394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
137442550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1375cbc6b225SStefano Zampini   delete akok;
1376f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13779566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13782c4ab24aSJunchao Zhang 
13792c4ab24aSJunchao Zhang   // Copy the COO struct to device
13802c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13812c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13822c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13832c4ab24aSJunchao Zhang 
13842c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
138503e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
13863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138742550becSJunchao Zhang }
138842550becSJunchao Zhang 
1389d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1390d71ae5a4SJacob Faibussowitsch {
139142550becSJunchao Zhang   MatScalarKokkosView        Aa;
139242550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
139342550becSJunchao Zhang   PetscMemType               memtype;
13942c4ab24aSJunchao Zhang   PetscContainer             container;
13952c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
139642550becSJunchao Zhang 
139742550becSJunchao Zhang   PetscFunctionBegin;
13982c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13992c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
14002c4ab24aSJunchao Zhang 
14012c4ab24aSJunchao Zhang   const auto &n    = coo->n;
14022c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
14032c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
14042c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
14052c4ab24aSJunchao Zhang 
14069566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
140742550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
14082c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
140942550becSJunchao Zhang   } else {
14102c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
141142550becSJunchao Zhang   }
141242550becSJunchao Zhang 
1413c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1414c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
141542550becSJunchao Zhang 
141608bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
14179371c9d4SSatish Balay   Kokkos::parallel_for(
1418d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1419c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1420c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1421c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1422c7b718f4SJunchao Zhang     });
142308bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1424394ed5ebSJunchao Zhang 
14259566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
14269566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
14273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
142842550becSJunchao Zhang }
142942550becSJunchao Zhang 
1430d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1431d71ae5a4SJacob Faibussowitsch {
1432076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1433076ba34aSJunchao Zhang 
14348c3ff71bSJunchao Zhang   PetscFunctionBegin;
1435076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
14366f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
14376f3d89d0SStefano Zampini 
14388c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
14398c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
14408c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1441a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1442f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1443a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1444076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
14458c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
14468c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14478c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14488c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14498c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14508c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1451076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14520ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1453152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1454f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1455f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1456f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1457f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
145803db1824SAlex Lindsay   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1459076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1460076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1461076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1462076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1463076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1464076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14657ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
146642550becSJunchao Zhang 
14679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14689566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
146957761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
147057761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
147157761e9aSJunchao Zhang #endif
14723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1473076ba34aSJunchao Zhang }
1474076ba34aSJunchao Zhang 
14759d13fa56SJunchao Zhang /*
14769d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14779d13fa56SJunchao Zhang 
14789d13fa56SJunchao Zhang   Input Parameters:
14799d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14809d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14819d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14829d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14839d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14849d13fa56SJunchao Zhang 
14859d13fa56SJunchao Zhang   Output Parameter:
14869d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14879d13fa56SJunchao Zhang */
14889d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14899d13fa56SJunchao Zhang {
14909d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14919d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14929d13fa56SJunchao Zhang 
14939d13fa56SJunchao Zhang   PetscFunctionBegin;
14949d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14959d13fa56SJunchao Zhang 
14969d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14979d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14989d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14999d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
15009d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
15019d13fa56SJunchao Zhang   // TODO: how to tune the team size?
150245402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
15039d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
15049d13fa56SJunchao Zhang #else
15059d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
15069d13fa56SJunchao Zhang #endif
15079d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1508d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
15099d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
15109d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
15119d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
15129d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
15139d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
15149d13fa56SJunchao Zhang 
15159d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
15169d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
15179d13fa56SJunchao Zhang 
15189d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
15199d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
15209d13fa56SJunchao Zhang 
15219d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
15229d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
15239d13fa56SJunchao Zhang               B(r, c) = 0.0;
15249d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
15259d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
15269d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
15279d13fa56SJunchao Zhang               B(r, c) = 0.0;
15289d13fa56SJunchao Zhang             }
15299d13fa56SJunchao Zhang           }
15309d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
15319d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
15329d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
15339d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
15349d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
15359d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
15369d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
15379d13fa56SJunchao Zhang           }
15389d13fa56SJunchao Zhang         }
15399d13fa56SJunchao Zhang       });
15409d13fa56SJunchao Zhang 
15419d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
15429d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
15439d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
15449d13fa56SJunchao Zhang     }));
15459d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
15469d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15479d13fa56SJunchao Zhang }
15489d13fa56SJunchao Zhang 
1549d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1550d71ae5a4SJacob Faibussowitsch {
1551076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1552076ba34aSJunchao Zhang   PetscInt    i, m, n;
15534df4a32cSJunchao Zhang   auto        exec = PetscGetKokkosExecutionSpace();
1554076ba34aSJunchao Zhang 
1555076ba34aSJunchao Zhang   PetscFunctionBegin;
15565f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1557076ba34aSJunchao Zhang 
1558076ba34aSJunchao Zhang   m = akok->nrows();
1559076ba34aSJunchao Zhang   n = akok->ncols();
15609566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15619566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1562076ba34aSJunchao Zhang 
1563076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
156557508eceSPierre Jolivet   aseq = (Mat_SeqAIJ *)A->data;
1566076ba34aSJunchao Zhang 
1567f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1568f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1569076ba34aSJunchao Zhang 
1570076ba34aSJunchao Zhang   aseq->i       = akok->i_host_data();
1571076ba34aSJunchao Zhang   aseq->j       = akok->j_host_data();
1572076ba34aSJunchao Zhang   aseq->a       = akok->a_host_data();
1573076ba34aSJunchao Zhang   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1574076ba34aSJunchao Zhang   aseq->free_a  = PETSC_FALSE;
1575076ba34aSJunchao Zhang   aseq->free_ij = PETSC_FALSE;
1576076ba34aSJunchao Zhang   aseq->nz      = akok->nnz();
1577076ba34aSJunchao Zhang   aseq->maxnz   = aseq->nz;
1578076ba34aSJunchao Zhang 
15799566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15809566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1581ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1582076ba34aSJunchao Zhang 
1583076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1584076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1585ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15869566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15879566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1589076ba34aSJunchao Zhang }
1590076ba34aSJunchao Zhang 
15910e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15920e3ece09SJunchao Zhang {
15930e3ece09SJunchao Zhang   PetscFunctionBegin;
15940e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15950e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15960e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15970e3ece09SJunchao Zhang }
15980e3ece09SJunchao Zhang 
15990e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
16000e3ece09SJunchao Zhang {
16010e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
16024d86920dSPierre Jolivet 
16030e3ece09SJunchao Zhang   PetscFunctionBegin;
16040e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
16050e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
16060e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16070e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
16080e3ece09SJunchao Zhang }
16090e3ece09SJunchao Zhang 
1610076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1611076ba34aSJunchao Zhang 
1612076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1613076ba34aSJunchao Zhang  */
1614d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1615d71ae5a4SJacob Faibussowitsch {
1616076ba34aSJunchao Zhang   PetscFunctionBegin;
16179566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16189566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16208c3ff71bSJunchao Zhang }
16218c3ff71bSJunchao Zhang 
1622152b3e56SJunchao Zhang /*@C
162311a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
16248c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
162520f4b53cSBarry Smith   Kokkos for calculations.
16268c3ff71bSJunchao Zhang 
16278c3ff71bSJunchao Zhang   Collective
16288c3ff71bSJunchao Zhang 
16298c3ff71bSJunchao Zhang   Input Parameters:
163011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
16318c3ff71bSJunchao Zhang . m    - number of rows
16328c3ff71bSJunchao Zhang . n    - number of columns
163320f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
163420f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
16358c3ff71bSJunchao Zhang 
16368c3ff71bSJunchao Zhang   Output Parameter:
16378c3ff71bSJunchao Zhang . A - the matrix
16388c3ff71bSJunchao Zhang 
16392ef1f0ffSBarry Smith   Level: intermediate
16402ef1f0ffSBarry Smith 
16412ef1f0ffSBarry Smith   Notes:
164211a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
16438c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
164411a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
16458c3ff71bSJunchao Zhang 
164611a5261eSBarry Smith   The AIJ format, also called
16472ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
16488c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
164920f4b53cSBarry Smith   either one (as in Fortran) or zero.
16508c3ff71bSJunchao Zhang 
16512ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16522ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16532ef1f0ffSBarry Smith   allocation.
16548c3ff71bSJunchao Zhang 
1655fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16568c3ff71bSJunchao Zhang @*/
1657d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1658d71ae5a4SJacob Faibussowitsch {
16598c3ff71bSJunchao Zhang   PetscFunctionBegin;
16609566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16619566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16629566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16639566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16668c3ff71bSJunchao Zhang }
1667930e68a5SMark Adams 
1668aac854edSJunchao Zhang // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1669aac854edSJunchao Zhang // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1670aac854edSJunchao Zhang // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1671aac854edSJunchao Zhang // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1672aac854edSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1673d71ae5a4SJacob Faibussowitsch {
167486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675aac854edSJunchao Zhang   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1676aac854edSJunchao Zhang   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
167786a27549SJunchao Zhang 
167886a27549SJunchao Zhang   PetscFunctionBegin;
1679aac854edSJunchao Zhang   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1680aac854edSJunchao Zhang     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1681aac854edSJunchao Zhang     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
168286a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
168386a27549SJunchao Zhang   }
16843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
168586a27549SJunchao Zhang }
168686a27549SJunchao Zhang 
1687d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1688d71ae5a4SJacob Faibussowitsch {
1689aac854edSJunchao Zhang   const PetscInt              n         = A->rmap->n;
169086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691aac854edSJunchao Zhang   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1692aac854edSJunchao Zhang   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
169386a27549SJunchao Zhang 
169486a27549SJunchao Zhang   PetscFunctionBegin;
1695aac854edSJunchao Zhang   if (!factors->transpose_updated) {
1696aac854edSJunchao Zhang     if (has_upper) {
1697aac854edSJunchao Zhang       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1698aac854edSJunchao Zhang         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
16997b8d4ba6SJunchao Zhang         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
17007b8d4ba6SJunchao Zhang         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1701aac854edSJunchao Zhang       }
170286a27549SJunchao Zhang 
1703aac854edSJunchao Zhang       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1704aac854edSJunchao Zhang         if (!factors->U) {
1705aac854edSJunchao Zhang           Mat_SeqAIJ *seq;
170686a27549SJunchao Zhang 
1707aac854edSJunchao Zhang           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1708aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
170986a27549SJunchao Zhang 
1710aac854edSJunchao Zhang           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1711aac854edSJunchao Zhang           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1712aac854edSJunchao Zhang           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1713aac854edSJunchao Zhang           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1714aac854edSJunchao Zhang         } else {
1715aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1716aac854edSJunchao Zhang         }
1717aac854edSJunchao Zhang         // Copy Ut from host to device
1718aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1719aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1720aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1721aac854edSJunchao Zhang       } else { // If U was computed on device, we also compute the transpose there
1722aac854edSJunchao Zhang         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1723aac854edSJunchao Zhang         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1724aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1725aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1726aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->aUt_d));
1727aac854edSJunchao Zhang         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1728aac854edSJunchao Zhang       }
1729aac854edSJunchao Zhang       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730aac854edSJunchao Zhang     }
1731aac854edSJunchao Zhang 
1732aac854edSJunchao Zhang     // do the same for L with LU
1733aac854edSJunchao Zhang     if (has_lower) {
1734aac854edSJunchao Zhang       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1735aac854edSJunchao Zhang         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1736aac854edSJunchao Zhang         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1737aac854edSJunchao Zhang         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1738aac854edSJunchao Zhang       }
1739aac854edSJunchao Zhang 
1740aac854edSJunchao Zhang       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1741aac854edSJunchao Zhang         if (!factors->L) {
1742aac854edSJunchao Zhang           Mat_SeqAIJ *seq;
1743aac854edSJunchao Zhang 
1744aac854edSJunchao Zhang           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1745aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1746aac854edSJunchao Zhang 
1747aac854edSJunchao Zhang           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1748aac854edSJunchao Zhang           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1749aac854edSJunchao Zhang           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1750aac854edSJunchao Zhang           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1751aac854edSJunchao Zhang         } else {
1752aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1753aac854edSJunchao Zhang         }
1754aac854edSJunchao Zhang         // Copy Lt from host to device
1755aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1756aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1757aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1758aac854edSJunchao Zhang       } else { // If L was computed on device, we also compute the transpose there
1759aac854edSJunchao Zhang         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1760aac854edSJunchao Zhang         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1761aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1762aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1763aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->aLt_d));
1764aac854edSJunchao Zhang         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1765aac854edSJunchao Zhang       }
1766aac854edSJunchao Zhang       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767aac854edSJunchao Zhang     }
1768aac854edSJunchao Zhang 
176986a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
177086a27549SJunchao Zhang   }
17713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
177286a27549SJunchao Zhang }
177386a27549SJunchao Zhang 
1774aac854edSJunchao Zhang // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1775aac854edSJunchao Zhang // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1776aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1777d71ae5a4SJacob Faibussowitsch {
1778aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
177986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1780aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1781aac854edSJunchao Zhang   PetscScalarKokkosView       D       = factors->D_d;
1782aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1783aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1784aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1785aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm  = factors->rowperm;
1786aac854edSJunchao Zhang   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
178786a27549SJunchao Zhang 
178886a27549SJunchao Zhang   PetscFunctionBegin;
17899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1790aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1791aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1792aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1793aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
1794aac854edSJunchao Zhang 
1795aac854edSJunchao Zhang   // Solve U^T Y = B
1796aac854edSJunchao Zhang   if (identity) { // Reorder b with the row permutation
1797aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1798aac854edSJunchao Zhang     Y = factors->workVector;
1799aac854edSJunchao Zhang   } else {
1800aac854edSJunchao Zhang     B = factors->workVector;
1801aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1802aac854edSJunchao Zhang     Y = x;
1803aac854edSJunchao Zhang   }
1804aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1805aac854edSJunchao Zhang 
1806aac854edSJunchao Zhang   // Solve diag(D) Y' = Y.
1807aac854edSJunchao Zhang   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1808aac854edSJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1809aac854edSJunchao Zhang 
1810aac854edSJunchao Zhang   // Solve UX = Y
1811aac854edSJunchao Zhang   if (identity) {
1812aac854edSJunchao Zhang     X = x;
1813aac854edSJunchao Zhang   } else {
1814aac854edSJunchao Zhang     X = factors->workVector; // B is not needed anymore
1815aac854edSJunchao Zhang   }
1816aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1817aac854edSJunchao Zhang 
1818aac854edSJunchao Zhang   // Reorder X with the inverse column (row) permutation
18193a7d0413SPierre Jolivet   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1820aac854edSJunchao Zhang 
1821aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1822aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
18239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
18243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
182586a27549SJunchao Zhang }
182686a27549SJunchao Zhang 
1827aac854edSJunchao Zhang // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1828aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors.
1829aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1830aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1831d71ae5a4SJacob Faibussowitsch {
1832aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
183386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1834aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1835aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1836aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1837aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1838aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm      = factors->rowperm;
1839aac854edSJunchao Zhang   PetscIntKokkosView         &colperm      = factors->colperm;
1840aac854edSJunchao Zhang   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1841aac854edSJunchao Zhang   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
184286a27549SJunchao Zhang 
184386a27549SJunchao Zhang   PetscFunctionBegin;
18449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1845aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1846aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1847aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
184886a27549SJunchao Zhang 
1849aac854edSJunchao Zhang   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1850aac854edSJunchao Zhang   if (row_identity) {
1851aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1852aac854edSJunchao Zhang     Y = factors->workVector;
1853aac854edSJunchao Zhang   } else {
1854aac854edSJunchao Zhang     B = factors->workVector;
1855aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1856aac854edSJunchao Zhang     Y = x;
1857aac854edSJunchao Zhang   }
1858aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1859aac854edSJunchao Zhang 
1860aac854edSJunchao Zhang   // Solve U C^- x = Y
1861aac854edSJunchao Zhang   if (col_identity) {
1862aac854edSJunchao Zhang     X = x;
1863aac854edSJunchao Zhang   } else {
1864aac854edSJunchao Zhang     X = factors->workVector;
1865aac854edSJunchao Zhang   }
1866aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1867aac854edSJunchao Zhang 
1868aac854edSJunchao Zhang   // x = C X; Reorder X with the inverse col permutation
18693a7d0413SPierre Jolivet   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1870aac854edSJunchao Zhang 
1871aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1872aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
18739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
18743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
187586a27549SJunchao Zhang }
187686a27549SJunchao Zhang 
1877aac854edSJunchao Zhang // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1878aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors.
1879aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1880aac854edSJunchao Zhang // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1881aac854edSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1882aac854edSJunchao Zhang {
1883aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
1884aac854edSJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1885aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1886aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1887aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1888aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1889aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm      = factors->rowperm;
1890aac854edSJunchao Zhang   PetscIntKokkosView         &colperm      = factors->colperm;
1891aac854edSJunchao Zhang   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892aac854edSJunchao Zhang   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1893aac854edSJunchao Zhang 
1894aac854edSJunchao Zhang   PetscFunctionBegin;
1895aac854edSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1896aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1897aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1898aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
1899aac854edSJunchao Zhang 
1900aac854edSJunchao Zhang   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1901aac854edSJunchao Zhang   if (col_identity) { // Reorder b with the col permutation
1902aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1903aac854edSJunchao Zhang     Y = factors->workVector;
1904aac854edSJunchao Zhang   } else {
1905aac854edSJunchao Zhang     B = factors->workVector;
1906aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1907aac854edSJunchao Zhang     Y = x;
1908aac854edSJunchao Zhang   }
1909aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1910aac854edSJunchao Zhang 
1911aac854edSJunchao Zhang   // Solve L^T X = Y
1912aac854edSJunchao Zhang   if (row_identity) {
1913aac854edSJunchao Zhang     X = x;
1914aac854edSJunchao Zhang   } else {
1915aac854edSJunchao Zhang     X = factors->workVector;
1916aac854edSJunchao Zhang   }
1917aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1918aac854edSJunchao Zhang 
1919aac854edSJunchao Zhang   // x = R^- X = R^T X; Reorder X with the inverse row permutation
19203a7d0413SPierre Jolivet   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1921aac854edSJunchao Zhang 
1922aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1923aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1924aac854edSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1925aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1926aac854edSJunchao Zhang }
1927aac854edSJunchao Zhang 
1928aac854edSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1929aac854edSJunchao Zhang {
1930aac854edSJunchao Zhang   PetscFunctionBegin;
1931aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncHost(A));
1932aac854edSJunchao Zhang   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1933aac854edSJunchao Zhang 
1934aac854edSJunchao Zhang   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1935aac854edSJunchao Zhang     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1936aac854edSJunchao Zhang     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1937aac854edSJunchao Zhang     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1938aac854edSJunchao Zhang     const MatScalar            *Ba = b->a;
1939aac854edSJunchao Zhang     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1940aac854edSJunchao Zhang 
1941aac854edSJunchao Zhang     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1942aac854edSJunchao Zhang       // Allocate memory and copy the structure
1943aac854edSJunchao Zhang       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1944aac854edSJunchao Zhang       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1945aac854edSJunchao Zhang       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1946aac854edSJunchao Zhang       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1947aac854edSJunchao Zhang       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1948aac854edSJunchao Zhang       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1949aac854edSJunchao Zhang 
1950aac854edSJunchao Zhang       PetscInt *Li = factors->iL_h.data();
1951aac854edSJunchao Zhang       PetscInt *Lj = factors->jL_h.data();
1952aac854edSJunchao Zhang       PetscInt *Ui = factors->iU_h.data();
1953aac854edSJunchao Zhang       PetscInt *Uj = factors->jU_h.data();
1954aac854edSJunchao Zhang 
1955aac854edSJunchao Zhang       Li[0] = Ui[0] = 0;
1956aac854edSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
1957aac854edSJunchao Zhang         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1958aac854edSJunchao Zhang         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1959aac854edSJunchao Zhang 
196064dc1d19SNuno Nobre         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1961aac854edSJunchao Zhang         Lj[Li[i] + llen] = i;                                   // diagonal entry of L
1962aac854edSJunchao Zhang 
1963aac854edSJunchao Zhang         Uj[Ui[i]] = i;                                                             // diagonal entry of U
196464dc1d19SNuno Nobre         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on  the right of the diagonal
1965aac854edSJunchao Zhang 
1966aac854edSJunchao Zhang         Li[i + 1] = Li[i] + llen + 1;
1967aac854edSJunchao Zhang         Ui[i + 1] = Ui[i] + ulen;
1968aac854edSJunchao Zhang       }
1969aac854edSJunchao Zhang 
1970aac854edSJunchao Zhang       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1971aac854edSJunchao Zhang       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1972aac854edSJunchao Zhang       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1973aac854edSJunchao Zhang       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1974aac854edSJunchao Zhang       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1975aac854edSJunchao Zhang       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1976aac854edSJunchao Zhang 
1977aac854edSJunchao Zhang       // Copy row/col permutation to device
1978aac854edSJunchao Zhang       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1979aac854edSJunchao Zhang       PetscBool row_identity;
1980aac854edSJunchao Zhang       PetscCall(ISIdentity(rowperm, &row_identity));
1981aac854edSJunchao Zhang       if (!row_identity) {
1982aac854edSJunchao Zhang         const PetscInt *ip;
1983aac854edSJunchao Zhang 
1984aac854edSJunchao Zhang         PetscCall(ISGetIndices(rowperm, &ip));
1985aac854edSJunchao Zhang         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1986aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1987aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(rowperm, &ip));
1988aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1989aac854edSJunchao Zhang       }
1990aac854edSJunchao Zhang 
1991aac854edSJunchao Zhang       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1992aac854edSJunchao Zhang       PetscBool col_identity;
1993aac854edSJunchao Zhang       PetscCall(ISIdentity(colperm, &col_identity));
1994aac854edSJunchao Zhang       if (!col_identity) {
1995aac854edSJunchao Zhang         const PetscInt *ip;
1996aac854edSJunchao Zhang 
1997aac854edSJunchao Zhang         PetscCall(ISGetIndices(colperm, &ip));
1998aac854edSJunchao Zhang         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1999aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2000aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(colperm, &ip));
2001aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2002aac854edSJunchao Zhang       }
2003aac854edSJunchao Zhang 
2004aac854edSJunchao Zhang       /* Create sptrsv handles for L, U and their transpose */
2005aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2006aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2007aac854edSJunchao Zhang #else
2008aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2009aac854edSJunchao Zhang #endif
2010aac854edSJunchao Zhang       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2011aac854edSJunchao Zhang       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2012aac854edSJunchao Zhang       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2013aac854edSJunchao Zhang       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2014aac854edSJunchao Zhang     }
2015aac854edSJunchao Zhang 
2016aac854edSJunchao Zhang     // Copy the value
2017aac854edSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2018aac854edSJunchao Zhang       PetscInt        llen = Bi[i + 1] - Bi[i];
2019aac854edSJunchao Zhang       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2020aac854edSJunchao Zhang       const PetscInt *Li   = factors->iL_h.data();
2021aac854edSJunchao Zhang       const PetscInt *Ui   = factors->iU_h.data();
2022aac854edSJunchao Zhang 
2023aac854edSJunchao Zhang       PetscScalar *La = factors->aL_h.data();
2024aac854edSJunchao Zhang       PetscScalar *Ua = factors->aU_h.data();
2025aac854edSJunchao Zhang 
202664dc1d19SNuno Nobre       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2027aac854edSJunchao Zhang       La[Li[i] + llen] = 1.0;                                 // diagonal entry
2028aac854edSJunchao Zhang 
2029aac854edSJunchao Zhang       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
203064dc1d19SNuno Nobre       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2031aac854edSJunchao Zhang     }
2032aac854edSJunchao Zhang 
2033aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2034aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2035aac854edSJunchao Zhang     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2036aac854edSJunchao Zhang     factors->transpose_updated         = PETSC_FALSE;
2037aac854edSJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2038aac854edSJunchao Zhang 
2039aac854edSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2040aac854edSJunchao Zhang     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2041aac854edSJunchao Zhang   }
2042aac854edSJunchao Zhang 
2043aac854edSJunchao Zhang   B->ops->matsolve          = NULL;
2044aac854edSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2045aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2046aac854edSJunchao Zhang }
2047aac854edSJunchao Zhang 
2048aac854edSJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2049d71ae5a4SJacob Faibussowitsch {
205086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
205186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
205286a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
205386a27549SJunchao Zhang 
205486a27549SJunchao Zhang   PetscFunctionBegin;
20559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2056aac854edSJunchao Zhang   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
20579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2058076ba34aSJunchao Zhang 
2059076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
2060076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
2061076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
2062076ba34aSJunchao Zhang 
2063aac854edSJunchao Zhang   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
206486a27549SJunchao Zhang 
206586a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
206686a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
2067aac854edSJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2068aac854edSJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
206986a27549SJunchao Zhang   B->ops->matsolve          = NULL;
207086a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
207186a27549SJunchao Zhang 
207286a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
207386a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
207486a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2075eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
20769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
20773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
207886a27549SJunchao Zhang }
207986a27549SJunchao Zhang 
2080aac854edSJunchao Zhang // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2081aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2082d71ae5a4SJacob Faibussowitsch {
208386a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
208486a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
208586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
208686a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
208786a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
208886a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
208986a27549SJunchao Zhang 
209086a27549SJunchao Zhang   PetscFunctionBegin;
2091aac854edSJunchao Zhang   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
20929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
209386a27549SJunchao Zhang 
209486a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
209586a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2096aac854edSJunchao Zhang   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
209786a27549SJunchao Zhang 
209886a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
209986a27549SJunchao Zhang 
210086a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
210186a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
210286a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
210386a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
210486a27549SJunchao Zhang 
210586a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2106076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
2107076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
2108aac854edSJunchao Zhang   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
210986a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
211086a27549SJunchao Zhang 
211186a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
211286a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
211386a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
211486a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
211586a27549SJunchao Zhang 
211686a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
211786a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
211886a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2119aac854edSJunchao Zhang   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
212086a27549SJunchao Zhang #else
2121aac854edSJunchao Zhang   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
212286a27549SJunchao Zhang #endif
212386a27549SJunchao Zhang 
212486a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
212586a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
212686a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
212786a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
212886a27549SJunchao Zhang 
212986a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
21309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
213186a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
213286a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
213386a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
2134a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
213586a27549SJunchao Zhang 
2136aac854edSJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
21373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2138930e68a5SMark Adams }
2139930e68a5SMark Adams 
2140aac854edSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2141aac854edSJunchao Zhang {
2142aac854edSJunchao Zhang   PetscFunctionBegin;
2143aac854edSJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2144aac854edSJunchao Zhang   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2145aac854edSJunchao Zhang   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2146aac854edSJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2147aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2148aac854edSJunchao Zhang }
2149aac854edSJunchao Zhang 
2150aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2151aac854edSJunchao Zhang {
2152aac854edSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2153aac854edSJunchao Zhang 
2154aac854edSJunchao Zhang   PetscFunctionBegin;
2155aac854edSJunchao Zhang   if (!info->factoronhost) {
2156aac854edSJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2157aac854edSJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2158aac854edSJunchao Zhang   }
2159aac854edSJunchao Zhang 
2160aac854edSJunchao Zhang   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2161aac854edSJunchao Zhang   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2162aac854edSJunchao Zhang 
2163aac854edSJunchao Zhang   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2164aac854edSJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2165aac854edSJunchao Zhang   } else {
2166aac854edSJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2167aac854edSJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2168aac854edSJunchao Zhang   }
2169aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2170aac854edSJunchao Zhang }
2171aac854edSJunchao Zhang 
2172aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2173aac854edSJunchao Zhang {
2174aac854edSJunchao Zhang   PetscFunctionBegin;
2175aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncHost(A));
2176aac854edSJunchao Zhang   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2177aac854edSJunchao Zhang 
2178aac854edSJunchao Zhang   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2179aac854edSJunchao Zhang     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2180aac854edSJunchao Zhang     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2181aac854edSJunchao Zhang     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2182aac854edSJunchao Zhang     const MatScalar            *Ba = b->a;
2183aac854edSJunchao Zhang     PetscInt                    m  = B->rmap->n;
2184aac854edSJunchao Zhang 
2185aac854edSJunchao Zhang     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2186aac854edSJunchao Zhang       // Allocate memory and copy the structure
2187aac854edSJunchao Zhang       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2188aac854edSJunchao Zhang       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2189aac854edSJunchao Zhang       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2190aac854edSJunchao Zhang       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2191aac854edSJunchao Zhang       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2192aac854edSJunchao Zhang       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2193aac854edSJunchao Zhang 
2194aac854edSJunchao Zhang       // Build jU_h from the skewed Aj
2195aac854edSJunchao Zhang       PetscInt *Uj = factors->jU_h.data();
2196aac854edSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
2197aac854edSJunchao Zhang         PetscInt ulen = Bi[i + 1] - Bi[i];
2198aac854edSJunchao Zhang         Uj[Bi[i]]     = i;                                              // diagonal entry
2199aac854edSJunchao Zhang         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2200aac854edSJunchao Zhang       }
2201aac854edSJunchao Zhang 
2202aac854edSJunchao Zhang       // Copy iU, jU to device
2203aac854edSJunchao Zhang       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2204aac854edSJunchao Zhang       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2205aac854edSJunchao Zhang 
2206aac854edSJunchao Zhang       // Copy row/col permutation to device
2207aac854edSJunchao Zhang       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2208aac854edSJunchao Zhang       PetscBool row_identity;
2209aac854edSJunchao Zhang       PetscCall(ISIdentity(rowperm, &row_identity));
2210aac854edSJunchao Zhang       if (!row_identity) {
2211aac854edSJunchao Zhang         const PetscInt *ip;
2212aac854edSJunchao Zhang 
2213aac854edSJunchao Zhang         PetscCall(ISGetIndices(rowperm, &ip));
2214aac854edSJunchao Zhang         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2215aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2216aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(rowperm, &ip));
2217aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2218aac854edSJunchao Zhang       }
2219aac854edSJunchao Zhang 
2220aac854edSJunchao Zhang       // Create sptrsv handles for U and U^T
2221aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2222aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2223aac854edSJunchao Zhang #else
2224aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2225aac854edSJunchao Zhang #endif
2226aac854edSJunchao Zhang       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2227aac854edSJunchao Zhang       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2228aac854edSJunchao Zhang     }
2229aac854edSJunchao Zhang     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2230aac854edSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2231aac854edSJunchao Zhang     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2232aac854edSJunchao Zhang 
2233aac854edSJunchao Zhang     // Copy the value
2234aac854edSJunchao Zhang     PetscScalar *Ua = factors->aU_h.data();
2235aac854edSJunchao Zhang     PetscScalar *D  = factors->D_h.data();
2236aac854edSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2237aac854edSJunchao Zhang       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2238aac854edSJunchao Zhang       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2239aac854edSJunchao Zhang       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2240aac854edSJunchao Zhang     }
2241aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2242aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2243aac854edSJunchao Zhang 
2244aac854edSJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2245aac854edSJunchao Zhang     factors->transpose_updated         = PETSC_FALSE;
2246aac854edSJunchao Zhang   }
2247aac854edSJunchao Zhang 
2248aac854edSJunchao Zhang   B->ops->matsolve          = NULL;
2249aac854edSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2250aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2251aac854edSJunchao Zhang }
2252aac854edSJunchao Zhang 
2253aac854edSJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2254aac854edSJunchao Zhang {
2255aac854edSJunchao Zhang   PetscFunctionBegin;
2256aac854edSJunchao Zhang   if (info->solveonhost) {
2257aac854edSJunchao Zhang     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2258aac854edSJunchao Zhang     PetscCall(MatSetType(B, MATSEQSBAIJ));
2259aac854edSJunchao Zhang     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2260aac854edSJunchao Zhang   }
2261aac854edSJunchao Zhang 
2262aac854edSJunchao Zhang   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2263aac854edSJunchao Zhang 
2264aac854edSJunchao Zhang   if (!info->solveonhost) {
2265bfe80ac4SPierre Jolivet     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2266aac854edSJunchao Zhang     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2267aac854edSJunchao Zhang     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2268aac854edSJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2269aac854edSJunchao Zhang   }
2270aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2271aac854edSJunchao Zhang }
2272aac854edSJunchao Zhang 
2273aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2274aac854edSJunchao Zhang {
2275aac854edSJunchao Zhang   PetscFunctionBegin;
2276aac854edSJunchao Zhang   if (info->solveonhost) {
2277aac854edSJunchao Zhang     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2278aac854edSJunchao Zhang     PetscCall(MatSetType(B, MATSEQSBAIJ));
2279aac854edSJunchao Zhang     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2280aac854edSJunchao Zhang   }
2281aac854edSJunchao Zhang 
2282aac854edSJunchao Zhang   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2283aac854edSJunchao Zhang 
2284aac854edSJunchao Zhang   if (!info->solveonhost) {
2285bfe80ac4SPierre Jolivet     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2286aac854edSJunchao Zhang     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2287aac854edSJunchao Zhang     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2288aac854edSJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2289aac854edSJunchao Zhang   }
2290aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2291aac854edSJunchao Zhang }
2292aac854edSJunchao Zhang 
2293aac854edSJunchao Zhang // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2294aac854edSJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2295d71ae5a4SJacob Faibussowitsch {
2296930e68a5SMark Adams   PetscFunctionBegin;
2297930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
22983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2299930e68a5SMark Adams }
2300930e68a5SMark Adams 
2301930e68a5SMark Adams /*MC
230286a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
230311a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2304930e68a5SMark Adams 
2305930e68a5SMark Adams   Level: beginner
2306930e68a5SMark Adams 
23071cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2308930e68a5SMark Adams M*/
230986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2310930e68a5SMark Adams {
2311930e68a5SMark Adams   PetscInt n = A->rmap->n;
2312aac854edSJunchao Zhang   MPI_Comm comm;
2313930e68a5SMark Adams 
2314930e68a5SMark Adams   PetscFunctionBegin;
2315aac854edSJunchao Zhang   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2316aac854edSJunchao Zhang   PetscCall(MatCreate(comm, B));
23179566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
2318aac854edSJunchao Zhang   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2319930e68a5SMark Adams   (*B)->factortype = ftype;
23209566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
23219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2322aac854edSJunchao Zhang   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2323aac854edSJunchao Zhang 
2324aac854edSJunchao Zhang   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2325aac854edSJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2326aac854edSJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2327aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2328aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2329aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2330aac854edSJunchao Zhang   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2331aac854edSJunchao Zhang     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2332aac854edSJunchao Zhang     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2333aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2334aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2335aac854edSJunchao Zhang   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2336aac854edSJunchao Zhang 
2337aac854edSJunchao Zhang   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2338aac854edSJunchao Zhang   (*B)->canuseordering = PETSC_TRUE;
2339aac854edSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
23403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2341930e68a5SMark Adams }
23428f7e8f9dSMark Adams 
2343aac854edSJunchao Zhang PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2344d71ae5a4SJacob Faibussowitsch {
234586a27549SJunchao Zhang   PetscFunctionBegin;
23469566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2347aac854edSJunchao Zhang   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
23489566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349aac854edSJunchao Zhang   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
23503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
235186a27549SJunchao Zhang }
235286a27549SJunchao Zhang 
2353076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
2354d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2355d71ae5a4SJacob Faibussowitsch {
235645402d8aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
235745402d8aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
235845402d8aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2359076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
2360076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
2361076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
2362076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2363076ba34aSJunchao Zhang 
2364076ba34aSJunchao Zhang   PetscFunctionBegin;
23659566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2366076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
23679566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
236848a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
23699566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2370076ba34aSJunchao Zhang   }
23713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2372076ba34aSJunchao Zhang }
2373