xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 93a54799f44632781c486dd4b73c1c88575e4136)
1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp>
211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
3c0c276a7Ssdargavi #include <petscmat_kokkos.hpp>
4076ba34aSJunchao Zhang #include <petscpkg_version.h>
5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
642550becSJunchao Zhang #include <petsc/private/sfimpl.h>
7aac854edSJunchao Zhang #include <petsc/private/kokkosimpl.hpp>
87233ce55SJed Brown #include <petscsys.h>
98c3ff71bSJunchao Zhang 
108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
11f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
13cc6e31f1SJunchao Zhang 
14cc6e31f1SJunchao Zhang // To suppress compiler warnings:
15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
21cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22cc6e31f1SJunchao Zhang 
2386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
2486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
25076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
26076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
279d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp>
289d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp>
2986a27549SJunchao Zhang 
3042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
318c3ff71bSJunchao Zhang 
320e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
34f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
359371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
36f98996d3SJunchao Zhang #else
37f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
38f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
399371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
40f98996d3SJunchao Zhang #endif
41f98996d3SJunchao Zhang 
42aac854edSJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43aac854edSJunchao Zhang using KokkosSparse::spiluk_symbolic;
44aac854edSJunchao Zhang using KokkosSparse::spiluk_numeric;
45aac854edSJunchao Zhang using KokkosSparse::sptrsv_symbolic;
46aac854edSJunchao Zhang using KokkosSparse::sptrsv_solve;
47aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm;
48aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm;
49aac854edSJunchao Zhang #else
50aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_symbolic;
51aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_numeric;
52aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_symbolic;
53aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_solve;
54aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm;
55aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm;
56aac854edSJunchao Zhang #endif
57aac854edSJunchao Zhang 
588c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
598c3ff71bSJunchao Zhang 
60076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
63076ba34aSJunchao Zhang  */
64d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65d71ae5a4SJacob Faibussowitsch {
66076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
67076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
688c3ff71bSJunchao Zhang 
698c3ff71bSJunchao Zhang   PetscFunctionBegin;
703ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
719566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72076ba34aSJunchao Zhang 
73076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75076ba34aSJunchao Zhang 
76076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
77076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
79076ba34aSJunchao Zhang   */
80076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81*93a54799SPierre Jolivet     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
82d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84d1c799ffSJunchao Zhang       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87d1c799ffSJunchao Zhang       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88d1c799ffSJunchao Zhang       aijseq->free_a  = PETSC_TRUE;
89d1c799ffSJunchao Zhang       aijseq->free_ij = PETSC_TRUE;
90d1c799ffSJunchao Zhang       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91d1c799ffSJunchao Zhang          we have the CsrMatrix on device first and then copy to host, followed by
92d1c799ffSJunchao Zhang          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93d1c799ffSJunchao Zhang          One could improve it by not using NULL garray.
94d1c799ffSJunchao Zhang       */
95d1c799ffSJunchao Zhang     }
96076ba34aSJunchao Zhang     delete aijkok;
97f4747e26SJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98076ba34aSJunchao Zhang     A->spptr = aijkok;
99f4747e26SJunchao Zhang   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100f4747e26SJunchao Zhang     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101f4747e26SJunchao Zhang     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102f4747e26SJunchao Zhang     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103076ba34aSJunchao Zhang   }
1043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1058c3ff71bSJunchao Zhang }
1068c3ff71bSJunchao Zhang 
10786a27549SJunchao Zhang /* Sync CSR data to device if not yet */
108d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109d71ae5a4SJacob Faibussowitsch {
1108c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1118c3ff71bSJunchao Zhang 
1128c3ff71bSJunchao Zhang   PetscFunctionBegin;
113aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
1145f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
116f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
11886a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
1198c3ff71bSJunchao Zhang   }
1203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1218c3ff71bSJunchao Zhang }
1228c3ff71bSJunchao Zhang 
123076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
124d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125d71ae5a4SJacob Faibussowitsch {
12686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
12786a27549SJunchao Zhang 
12886a27549SJunchao Zhang   PetscFunctionBegin;
1295f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
13086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
13186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
13286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
13386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
1349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
1359566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
1363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13786a27549SJunchao Zhang }
13886a27549SJunchao Zhang 
139d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140d71ae5a4SJacob Faibussowitsch {
141f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1424df4a32cSJunchao Zhang   auto              exec   = PetscGetKokkosExecutionSpace();
143f0cf5187SStefano Zampini 
144f0cf5187SStefano Zampini   PetscFunctionBegin;
145f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
14686a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
147aaa8cc7dSPierre Jolivet   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
1485f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
1503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
151f0cf5187SStefano Zampini }
152f0cf5187SStefano Zampini 
153d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154d71ae5a4SJacob Faibussowitsch {
155076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156f0cf5187SStefano Zampini 
157f0cf5187SStefano Zampini   PetscFunctionBegin;
1585519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1595519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1605519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1615519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1625519a089SJose E. Roman   */
1635519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
166076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
167076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168076ba34aSJunchao Zhang   }
1693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
170076ba34aSJunchao Zhang }
171076ba34aSJunchao Zhang 
172d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173d71ae5a4SJacob Faibussowitsch {
174076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175076ba34aSJunchao Zhang 
176076ba34aSJunchao Zhang   PetscFunctionBegin;
1775519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
1783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
179076ba34aSJunchao Zhang }
180076ba34aSJunchao Zhang 
181d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182d71ae5a4SJacob Faibussowitsch {
183076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184076ba34aSJunchao Zhang 
185076ba34aSJunchao Zhang   PetscFunctionBegin;
1865519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1892328674fSJunchao Zhang   } else {
1902328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1912328674fSJunchao Zhang   }
1923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
193076ba34aSJunchao Zhang }
194076ba34aSJunchao Zhang 
195d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196d71ae5a4SJacob Faibussowitsch {
197076ba34aSJunchao Zhang   PetscFunctionBegin;
198076ba34aSJunchao Zhang   *array = NULL;
1993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
200076ba34aSJunchao Zhang }
201076ba34aSJunchao Zhang 
202d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203d71ae5a4SJacob Faibussowitsch {
204076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205076ba34aSJunchao Zhang 
206076ba34aSJunchao Zhang   PetscFunctionBegin;
2075519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
2092328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
2102328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
2112328674fSJunchao Zhang   }
2123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
213076ba34aSJunchao Zhang }
214076ba34aSJunchao Zhang 
215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216d71ae5a4SJacob Faibussowitsch {
217076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218076ba34aSJunchao Zhang 
219076ba34aSJunchao Zhang   PetscFunctionBegin;
2205519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
222076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
2232328674fSJunchao Zhang   }
2243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
225f0cf5187SStefano Zampini }
226f0cf5187SStefano Zampini 
227d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228d71ae5a4SJacob Faibussowitsch {
2297ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
2307ee59b9bSJunchao Zhang 
2317ee59b9bSJunchao Zhang   PetscFunctionBegin;
2327ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
2337ee59b9bSJunchao Zhang 
2347ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
2357ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
2367ee59b9bSJunchao Zhang   if (a) {
237f3d3cd90SZach Atkins     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
2387ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
2397ee59b9bSJunchao Zhang   }
2407ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
2413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2427ee59b9bSJunchao Zhang }
2437ee59b9bSJunchao Zhang 
2440e3ece09SJunchao Zhang /*
2450e3ece09SJunchao Zhang   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
2460e3ece09SJunchao Zhang 
2470e3ece09SJunchao Zhang   Input Parameter:
2480e3ece09SJunchao Zhang .  A       - the MATSEQAIJKOKKOS matrix
2490e3ece09SJunchao Zhang 
2500e3ece09SJunchao Zhang   Output Parameters:
2510e3ece09SJunchao Zhang +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
252aaa8cc7dSPierre Jolivet -  T_d    - the transpose on device, whose value array is allocated but not initialized
2530e3ece09SJunchao Zhang */
2540e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255d71ae5a4SJacob Faibussowitsch {
2560e3ece09SJunchao Zhang   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
2570e3ece09SJunchao Zhang   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
2580e3ece09SJunchao Zhang   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
2597b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
2600e3ece09SJunchao Zhang   MatRowMapType          *Ti = Ti_h.data();
2617b8d4ba6SJunchao Zhang   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
2627b8d4ba6SJunchao Zhang   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
2630e3ece09SJunchao Zhang   PetscInt               *Tj   = Tj_h.data();
2640e3ece09SJunchao Zhang   PetscInt               *perm = perm_h.data();
2650e3ece09SJunchao Zhang   PetscInt               *offset;
266152b3e56SJunchao Zhang 
267152b3e56SJunchao Zhang   PetscFunctionBegin;
2680e3ece09SJunchao Zhang   // Populate Ti
2690e3ece09SJunchao Zhang   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
2700e3ece09SJunchao Zhang   Ti++;
2710e3ece09SJunchao Zhang   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
2720e3ece09SJunchao Zhang   Ti--;
2730e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
2740e3ece09SJunchao Zhang 
2750e3ece09SJunchao Zhang   // Populate Tj and the permutation array
2760e3ece09SJunchao Zhang   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
2770e3ece09SJunchao Zhang   for (PetscInt i = 0; i < m; i++) {
2780e3ece09SJunchao Zhang     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
2790e3ece09SJunchao Zhang       PetscInt r    = Aj[j];                       // row r of T
2800e3ece09SJunchao Zhang       PetscInt disp = Ti[r] + offset[r];
2810e3ece09SJunchao Zhang 
2820e3ece09SJunchao Zhang       Tj[disp]   = i; // col i of T
2830e3ece09SJunchao Zhang       perm[disp] = j;
2840e3ece09SJunchao Zhang       offset[r]++;
285076ba34aSJunchao Zhang     }
2860e3ece09SJunchao Zhang   }
2870e3ece09SJunchao Zhang   PetscCall(PetscFree(offset));
2880e3ece09SJunchao Zhang 
2890e3ece09SJunchao Zhang   // Sort each row of T, along with the permutation array
2900e3ece09SJunchao Zhang   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
2910e3ece09SJunchao Zhang 
2920e3ece09SJunchao Zhang   // Output perm and T on device
2930e3ece09SJunchao Zhang   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
2940e3ece09SJunchao Zhang   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
2950e3ece09SJunchao Zhang   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
2960e3ece09SJunchao Zhang   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
2973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
298152b3e56SJunchao Zhang }
299152b3e56SJunchao Zhang 
3000e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally
3010e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
3020e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303d71ae5a4SJacob Faibussowitsch {
3040e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3050e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3060e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3070e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatT;
308152b3e56SJunchao Zhang 
309152b3e56SJunchao Zhang   PetscFunctionBegin;
3100e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
3120e3ece09SJunchao Zhang 
3130e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3140e3ece09SJunchao Zhang 
3150e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE) {
3160e3ece09SJunchao Zhang     *csrmatT = akok->csrmat;
3170e3ece09SJunchao Zhang   } else {
3180e3ece09SJunchao Zhang     // See if we already have a cached transpose and its value is up to date
3190e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3200e3ece09SJunchao Zhang       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
3210e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3220e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3230e3ece09SJunchao Zhang 
324d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325076ba34aSJunchao Zhang       }
3260e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3270e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3280e3ece09SJunchao Zhang 
3290e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3300e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
3320e3ece09SJunchao Zhang     }
3330e3ece09SJunchao Zhang     akok->transpose_updated = PETSC_TRUE;
3340e3ece09SJunchao Zhang     *csrmatT                = akok->csrmatT;
3350e3ece09SJunchao Zhang   }
3360e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
3370e3ece09SJunchao Zhang }
3380e3ece09SJunchao Zhang 
3390e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally
3400e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
3410e3ece09SJunchao Zhang {
3420e3ece09SJunchao Zhang   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
3430e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
3440e3ece09SJunchao Zhang   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
3450e3ece09SJunchao Zhang   KokkosCsrMatrix  &T = akok->csrmatH;
3460e3ece09SJunchao Zhang 
3470e3ece09SJunchao Zhang   PetscFunctionBegin;
3480e3ece09SJunchao Zhang   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
3500e3ece09SJunchao Zhang 
3510e3ece09SJunchao Zhang   const auto &Aa = akok->a_dual.view_device();
3520e3ece09SJunchao Zhang 
3530e3ece09SJunchao Zhang   if (A->hermitian == PETSC_BOOL3_TRUE) {
3540e3ece09SJunchao Zhang     *csrmatH = akok->csrmat;
3550e3ece09SJunchao Zhang   } else {
3560e3ece09SJunchao Zhang     // See if we already have a cached hermitian and its value is up to date
3570e3ece09SJunchao Zhang     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
3580e3ece09SJunchao Zhang       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
3590e3ece09SJunchao Zhang         const auto &perm = akok->transpose_perm; // get the permutation array
3600e3ece09SJunchao Zhang         auto       &Ta   = T.values;
3610e3ece09SJunchao Zhang 
362d326c3f1SJunchao Zhang         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
3630e3ece09SJunchao Zhang       }
3640e3ece09SJunchao Zhang     } else { // Generate T of size n x m for the first time
3650e3ece09SJunchao Zhang       MatRowMapKokkosView perm;
3660e3ece09SJunchao Zhang 
3670e3ece09SJunchao Zhang       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
3680e3ece09SJunchao Zhang       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369d326c3f1SJunchao Zhang       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
3700e3ece09SJunchao Zhang     }
3710e3ece09SJunchao Zhang     akok->hermitian_updated = PETSC_TRUE;
3720e3ece09SJunchao Zhang     *csrmatH                = akok->csrmatH;
3730e3ece09SJunchao Zhang   }
3743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
375152b3e56SJunchao Zhang }
376a587d139SMark 
3778c3ff71bSJunchao Zhang /* y = A x */
378d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379d71ae5a4SJacob Faibussowitsch {
3808c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
381152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
382152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
3838c3ff71bSJunchao Zhang 
3848c3ff71bSJunchao Zhang   PetscFunctionBegin;
3859566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3879566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3889566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
3898c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
3919566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3929566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
3949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3978c3ff71bSJunchao Zhang }
3988c3ff71bSJunchao Zhang 
3998c3ff71bSJunchao Zhang /* y = A^T x */
400d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401d71ae5a4SJacob Faibussowitsch {
4028c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
403152b3e56SJunchao Zhang   const char                *mode;
404152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
405152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4060e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4078c3ff71bSJunchao Zhang 
4088c3ff71bSJunchao Zhang   PetscFunctionBegin;
4099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4109566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4119566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4129566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415152b3e56SJunchao Zhang     mode = "N";
416152b3e56SJunchao Zhang   } else {
417076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4180e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
419152b3e56SJunchao Zhang     mode   = "T";
420152b3e56SJunchao Zhang   }
421d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
4229566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4239566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4240e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4278c3ff71bSJunchao Zhang }
4288c3ff71bSJunchao Zhang 
4298c3ff71bSJunchao Zhang /* y = A^H x */
430d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431d71ae5a4SJacob Faibussowitsch {
4328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
433152b3e56SJunchao Zhang   const char                *mode;
434152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
435152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
4360e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4378c3ff71bSJunchao Zhang 
4388c3ff71bSJunchao Zhang   PetscFunctionBegin;
4399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4409566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4419566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4429566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4449566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445152b3e56SJunchao Zhang     mode = "N";
446152b3e56SJunchao Zhang   } else {
447076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
4480e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
449152b3e56SJunchao Zhang     mode   = "C";
450152b3e56SJunchao Zhang   }
451d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
4529566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4539566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
4540e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
4559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4578c3ff71bSJunchao Zhang }
4588c3ff71bSJunchao Zhang 
4598c3ff71bSJunchao Zhang /* z = A x + y */
460d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461d71ae5a4SJacob Faibussowitsch {
4628c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
46392896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
464152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4658c3ff71bSJunchao Zhang 
4668c3ff71bSJunchao Zhang   PetscFunctionBegin;
4679566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
46992896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
4709566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
47192896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
4728c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
4749566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
47592896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
4769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
4779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4798c3ff71bSJunchao Zhang }
4808c3ff71bSJunchao Zhang 
4818c3ff71bSJunchao Zhang /* z = A^T x + y */
482d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483d71ae5a4SJacob Faibussowitsch {
4848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
485152b3e56SJunchao Zhang   const char                *mode;
48692896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
487152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
4880e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
4898c3ff71bSJunchao Zhang 
4908c3ff71bSJunchao Zhang   PetscFunctionBegin;
4919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
49392896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
4949566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
49592896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
496152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4979566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498152b3e56SJunchao Zhang     mode = "N";
499152b3e56SJunchao Zhang   } else {
500076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5010e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
502152b3e56SJunchao Zhang     mode   = "T";
503152b3e56SJunchao Zhang   }
504d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
5059566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
50692896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5070e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5108c3ff71bSJunchao Zhang }
5118c3ff71bSJunchao Zhang 
5128c3ff71bSJunchao Zhang /* z = A^H x + y */
513d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514d71ae5a4SJacob Faibussowitsch {
5158c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
516152b3e56SJunchao Zhang   const char                *mode;
51792896123SJunchao Zhang   ConstPetscScalarKokkosView xv;
518152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
5190e3ece09SJunchao Zhang   KokkosCsrMatrix            csrmat;
5208c3ff71bSJunchao Zhang 
5218c3ff71bSJunchao Zhang   PetscFunctionBegin;
5229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
5239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
52492896123SJunchao Zhang   if (zz != yy) PetscCall(VecCopy(yy, zz));
5259566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
52692896123SJunchao Zhang   PetscCall(VecGetKokkosView(zz, &zv));
527152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
5289566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529152b3e56SJunchao Zhang     mode = "N";
530152b3e56SJunchao Zhang   } else {
531076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5320e3ece09SJunchao Zhang     csrmat = aijkok->csrmat;
533152b3e56SJunchao Zhang     mode   = "C";
534152b3e56SJunchao Zhang   }
535d326c3f1SJunchao Zhang   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
5369566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
53792896123SJunchao Zhang   PetscCall(VecRestoreKokkosView(zz, &zv));
5380e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
5399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
5403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
541152b3e56SJunchao Zhang }
542152b3e56SJunchao Zhang 
54366976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544d71ae5a4SJacob Faibussowitsch {
545152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
546152b3e56SJunchao Zhang 
547152b3e56SJunchao Zhang   PetscFunctionBegin;
548152b3e56SJunchao Zhang   switch (op) {
549152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
550152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
5519566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
553152b3e56SJunchao Zhang     break;
554d71ae5a4SJacob Faibussowitsch   default:
555d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556d71ae5a4SJacob Faibussowitsch     break;
557152b3e56SJunchao Zhang   }
5583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5598c3ff71bSJunchao Zhang }
5608c3ff71bSJunchao Zhang 
561076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
562d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563d71ae5a4SJacob Faibussowitsch {
564076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
5658c3ff71bSJunchao Zhang 
5668c3ff71bSJunchao Zhang   PetscFunctionBegin;
5679566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
568076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
56951ece73cSJunchao Zhang     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
57051ece73cSJunchao Zhang     PetscCall(MatSetType(*newmat, mtype));
5718c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
5729566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
5745f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
5759566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
5769566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
5779566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
5789566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
579076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
5815f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582f4747e26SJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
5838c3ff71bSJunchao Zhang     }
584076ba34aSJunchao Zhang   }
5853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5868c3ff71bSJunchao Zhang }
5878c3ff71bSJunchao Zhang 
588076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
590076ba34aSJunchao Zhang  */
591d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592d71ae5a4SJacob Faibussowitsch {
593076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
594076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595076ba34aSJunchao Zhang   Mat               mat;
5968c3ff71bSJunchao Zhang 
5978c3ff71bSJunchao Zhang   PetscFunctionBegin;
598076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
5999566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600076ba34aSJunchao Zhang   mat = *B;
601f4747e26SJunchao Zhang   if (A->assembled) {
602076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603f4747e26SJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605076ba34aSJunchao Zhang     /* Now copy values to B if needed */
606076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
607076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
608076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
610076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
611076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
613076ba34aSJunchao Zhang       }
614076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
617076ba34aSJunchao Zhang     }
618076ba34aSJunchao Zhang     mat->spptr = bkok;
619076ba34aSJunchao Zhang   }
620076ba34aSJunchao Zhang 
6219566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
6229566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
6239566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
6249566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
6253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6268c3ff71bSJunchao Zhang }
6278c3ff71bSJunchao Zhang 
628d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629d71ae5a4SJacob Faibussowitsch {
6300ecb592aSJunchao Zhang   Mat               At;
6310e3ece09SJunchao Zhang   KokkosCsrMatrix   internT;
6320ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
6330ecb592aSJunchao Zhang 
6340ecb592aSJunchao Zhang   PetscFunctionBegin;
6357fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
6369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
6370ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
6390e3ece09SJunchao Zhang     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
6409566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
6410ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
6429566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
6430ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
6440ecb592aSJunchao Zhang     if ((*B)->assembled) {
6450ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
6460e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
6479566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
6480ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
6490ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
6500e3ece09SJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
6510e3ece09SJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
6520e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
6530e3ece09SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
6540ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
6550ecb592aSJunchao Zhang   }
6563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6570ecb592aSJunchao Zhang }
6580ecb592aSJunchao Zhang 
659d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660d71ae5a4SJacob Faibussowitsch {
66186a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
6628c3ff71bSJunchao Zhang 
6638c3ff71bSJunchao Zhang   PetscFunctionBegin;
66486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
66586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
6668c3ff71bSJunchao Zhang     delete aijkok;
66786a27549SJunchao Zhang   } else {
66886a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
66986a27549SJunchao Zhang   }
670cbc6b225SStefano Zampini   A->spptr = NULL;
6719566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
6729566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
6739566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
67457761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
67557761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
67657761e9aSJunchao Zhang #endif
6779566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
6783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6798c3ff71bSJunchao Zhang }
6808c3ff71bSJunchao Zhang 
6813f3ba80aSJunchao Zhang /*MC
6823f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
6833f3ba80aSJunchao Zhang 
68415229ffcSPierre Jolivet    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
6853f3ba80aSJunchao Zhang 
6862ef1f0ffSBarry Smith    Options Database Key:
68711a5261eSBarry Smith .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
6883f3ba80aSJunchao Zhang 
6893f3ba80aSJunchao Zhang   Level: beginner
6903f3ba80aSJunchao Zhang 
6911cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
6923f3ba80aSJunchao Zhang M*/
693d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694d71ae5a4SJacob Faibussowitsch {
69586a27549SJunchao Zhang   PetscFunctionBegin;
6969566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6979566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
6989566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
6993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
70086a27549SJunchao Zhang }
70186a27549SJunchao Zhang 
702076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704d71ae5a4SJacob Faibussowitsch {
705076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
706076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
708076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
709076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
710076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
711a3f881fbSStefano Zampini 
712a3f881fbSStefano Zampini   PetscFunctionBegin;
713076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
714076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
7154f572ea9SToby Isaac   PetscAssertPointer(C, 4);
716076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
7185f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
7195f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
720076ba34aSJunchao Zhang 
7219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
724076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
725076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
728076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
729076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
730076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
731076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
732076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
733076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
734076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
735076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
736076ba34aSJunchao Zhang     aj      = akok->j_dual.view_device();
737076ba34aSJunchao Zhang     bj      = bkok->j_dual.view_device();
738ecd797f4SJunchao Zhang     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
739ecd797f4SJunchao Zhang     auto ci = MatRowMapKokkosView("i", ai.extent(0));
740ecd797f4SJunchao Zhang     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
741076ba34aSJunchao Zhang 
742076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
7439371c9d4SSatish Balay     Kokkos::parallel_for(
744d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
745076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
746076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
747076ba34aSJunchao Zhang 
748076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
749076ba34aSJunchao Zhang                                                    ci(i) = coffset;
750076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
751076ba34aSJunchao Zhang         });
752076ba34aSJunchao Zhang 
753076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
754076ba34aSJunchao Zhang           if (k < alen) {
755076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
756076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
757076ba34aSJunchao Zhang           } else {
758076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
759076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
760076ba34aSJunchao Zhang           }
761076ba34aSJunchao Zhang         });
762076ba34aSJunchao Zhang       });
763ecd797f4SJunchao Zhang     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
7649566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
765076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
766076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
767076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
768076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
769076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
770076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
771076ba34aSJunchao Zhang 
7729371c9d4SSatish Balay     Kokkos::parallel_for(
773d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
774076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
775076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
776076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
777076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
778076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
779076ba34aSJunchao Zhang         });
780076ba34aSJunchao Zhang       });
7819566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
782076ba34aSJunchao Zhang   }
7833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
784076ba34aSJunchao Zhang }
785076ba34aSJunchao Zhang 
786d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
787d71ae5a4SJacob Faibussowitsch {
788076ba34aSJunchao Zhang   PetscFunctionBegin;
789076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
7903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
791a3f881fbSStefano Zampini }
792a3f881fbSStefano Zampini 
793d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
794d71ae5a4SJacob Faibussowitsch {
795a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
796a3f881fbSStefano Zampini   Mat                          A, B;
797076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
798a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
799a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
800076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
8010e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB;
802a3f881fbSStefano Zampini 
803a3f881fbSStefano Zampini   PetscFunctionBegin;
804a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8055f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
806076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
807076ba34aSJunchao Zhang 
8080e3ece09SJunchao Zhang   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
8090e3ece09SJunchao Zhang   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
8100e3ece09SJunchao Zhang   // we still do numeric.
8110e3ece09SJunchao Zhang   if (pdata->reusesym) { // numeric reuses results from symbolic
8120e3ece09SJunchao Zhang     pdata->reusesym = PETSC_FALSE;
8133ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
814076ba34aSJunchao Zhang   }
815076ba34aSJunchao Zhang 
816076ba34aSJunchao Zhang   switch (product->type) {
8179371c9d4SSatish Balay   case MATPRODUCT_AB:
8189371c9d4SSatish Balay     transA = false;
8199371c9d4SSatish Balay     transB = false;
8209371c9d4SSatish Balay     break;
8219371c9d4SSatish Balay   case MATPRODUCT_AtB:
8229371c9d4SSatish Balay     transA = true;
8239371c9d4SSatish Balay     transB = false;
8249371c9d4SSatish Balay     break;
8259371c9d4SSatish Balay   case MATPRODUCT_ABt:
8269371c9d4SSatish Balay     transA = false;
8279371c9d4SSatish Balay     transB = true;
8289371c9d4SSatish Balay     break;
829d71ae5a4SJacob Faibussowitsch   default:
830d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
831076ba34aSJunchao Zhang   }
832076ba34aSJunchao Zhang 
833a3f881fbSStefano Zampini   A = product->A;
834a3f881fbSStefano Zampini   B = product->B;
8359566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
837a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
838a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
839a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
840076ba34aSJunchao Zhang 
8415f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
842076ba34aSJunchao Zhang 
8430e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
8440e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
845076ba34aSJunchao Zhang 
846076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
847076ba34aSJunchao Zhang   if (transA) {
8489566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
849076ba34aSJunchao Zhang     transA = false;
850a3f881fbSStefano Zampini   }
851a3f881fbSStefano Zampini 
852076ba34aSJunchao Zhang   if (transB) {
8539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
854076ba34aSJunchao Zhang     transB = false;
855076ba34aSJunchao Zhang   }
8569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8570e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
8580e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
859866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
860866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
861e944a159SJunchao Zhang #endif
862866eb059SJunchao Zhang 
8639566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
8649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
865a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
866a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
8679566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
8689566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
8699566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
870a3f881fbSStefano Zampini   c->reallocs         = 0;
871076ba34aSJunchao Zhang   C->info.mallocs     = 0;
872a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
873a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
874a3f881fbSStefano Zampini   C->num_ass++;
8753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
876a3f881fbSStefano Zampini }
877a3f881fbSStefano Zampini 
878d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
879d71ae5a4SJacob Faibussowitsch {
880076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
881076ba34aSJunchao Zhang   MatProductType               ptype;
882076ba34aSJunchao Zhang   Mat                          A, B;
883076ba34aSJunchao Zhang   bool                         transA, transB;
884076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
885076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
886076ba34aSJunchao Zhang   MPI_Comm                     comm;
8870e3ece09SJunchao Zhang   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
888a3f881fbSStefano Zampini 
889a3f881fbSStefano Zampini   PetscFunctionBegin;
890a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
8919566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
8925f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
893a3f881fbSStefano Zampini   A = product->A;
894a3f881fbSStefano Zampini   B = product->B;
8959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
8969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
897a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
898a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
8990e3ece09SJunchao Zhang   csrmatA = akok->csrmat;
9000e3ece09SJunchao Zhang   csrmatB = bkok->csrmat;
901076ba34aSJunchao Zhang 
902a3f881fbSStefano Zampini   ptype = product->type;
9030e3ece09SJunchao Zhang   // Take advantage of the symmetry if true
9040e3ece09SJunchao Zhang   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
9050e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
9060e3ece09SJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
9070e3ece09SJunchao Zhang   }
9080e3ece09SJunchao Zhang   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
9090e3ece09SJunchao Zhang     ptype                                          = MATPRODUCT_AB;
9100e3ece09SJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
9110e3ece09SJunchao Zhang   }
9120e3ece09SJunchao Zhang 
913a3f881fbSStefano Zampini   switch (ptype) {
9149371c9d4SSatish Balay   case MATPRODUCT_AB:
9159371c9d4SSatish Balay     transA = false;
9169371c9d4SSatish Balay     transB = false;
9170e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
9189371c9d4SSatish Balay     break;
9199371c9d4SSatish Balay   case MATPRODUCT_AtB:
9209371c9d4SSatish Balay     transA = true;
9219371c9d4SSatish Balay     transB = false;
9220e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
9230e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
9249371c9d4SSatish Balay     break;
9259371c9d4SSatish Balay   case MATPRODUCT_ABt:
9269371c9d4SSatish Balay     transA = false;
9279371c9d4SSatish Balay     transB = true;
9280e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
9290e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
9309371c9d4SSatish Balay     break;
931d71ae5a4SJacob Faibussowitsch   default:
932d71ae5a4SJacob Faibussowitsch     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
933a3f881fbSStefano Zampini   }
9340e3ece09SJunchao Zhang   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
935076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
936a3f881fbSStefano Zampini 
937076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
938866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
939866eb059SJunchao Zhang 
940866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
941866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
942866eb059SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
943866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
944866eb059SJunchao Zhang   #endif
945866eb059SJunchao Zhang #endif
9460e3ece09SJunchao Zhang   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
947076ba34aSJunchao Zhang 
9489566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
949076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
950076ba34aSJunchao Zhang   if (transA) {
9519566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
952076ba34aSJunchao Zhang     transA = false;
953076ba34aSJunchao Zhang   }
954076ba34aSJunchao Zhang 
955076ba34aSJunchao Zhang   if (transB) {
9569566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
957076ba34aSJunchao Zhang     transB = false;
958076ba34aSJunchao Zhang   }
959076ba34aSJunchao Zhang 
9600e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
961076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
962076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
963076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
964076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
965076ba34aSJunchao Zhang   */
9660e3ece09SJunchao Zhang   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
9670e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
968866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
969866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
970866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
971e944a159SJunchao Zhang #endif
9729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
973076ba34aSJunchao Zhang 
9749566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
9759566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
976076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
9773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
978a3f881fbSStefano Zampini }
979a3f881fbSStefano Zampini 
980a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
981d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
982d71ae5a4SJacob Faibussowitsch {
983076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
984a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
985a3f881fbSStefano Zampini 
986a3f881fbSStefano Zampini   PetscFunctionBegin;
987a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
9889566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
98948a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
990a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
991a3f881fbSStefano Zampini     switch (product->type) {
992a3f881fbSStefano Zampini     case MATPRODUCT_AB:
993a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
994d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
995d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
996d71ae5a4SJacob Faibussowitsch       break;
997a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
998a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
999d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
1000d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1001d71ae5a4SJacob Faibussowitsch       break;
1002d71ae5a4SJacob Faibussowitsch     default:
1003d71ae5a4SJacob Faibussowitsch       break;
1004a3f881fbSStefano Zampini     }
1005a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
10069566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1007a3f881fbSStefano Zampini   }
10083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1009a3f881fbSStefano Zampini }
1010a587d139SMark 
1011d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1012d71ae5a4SJacob Faibussowitsch {
1013f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
1014f0cf5187SStefano Zampini 
1015f0cf5187SStefano Zampini   PetscFunctionBegin;
10169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
10179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1018f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1019d326c3f1SJunchao Zhang   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
10209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
10219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
10229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1024f0cf5187SStefano Zampini }
1025f0cf5187SStefano Zampini 
1026f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1027f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1028f4747e26SJunchao Zhang {
1029f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1030f4747e26SJunchao Zhang 
1031f4747e26SJunchao Zhang   PetscFunctionBegin;
1032f4747e26SJunchao Zhang   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1033f4747e26SJunchao Zhang     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1034f4747e26SJunchao Zhang 
1035f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1036f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1037f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1038f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1039f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1040d326c3f1SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1041f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1042f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1043f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1044f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1045f4747e26SJunchao Zhang     PetscCall(MatShift_Basic(A, a));
1046f4747e26SJunchao Zhang   }
1047f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1048f4747e26SJunchao Zhang }
1049f4747e26SJunchao Zhang 
1050f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1051f4747e26SJunchao Zhang {
1052f4747e26SJunchao Zhang   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1053f4747e26SJunchao Zhang 
1054f4747e26SJunchao Zhang   PetscFunctionBegin;
1055f4747e26SJunchao Zhang   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1056f4747e26SJunchao Zhang     ConstPetscScalarKokkosView dv;
1057f4747e26SJunchao Zhang     PetscInt                   n, nv;
1058f4747e26SJunchao Zhang 
1059f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeBegin());
1060f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1061f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(D, &dv));
1062f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(D, &nv));
1063f4747e26SJunchao Zhang     n = PetscMin(Y->rmap->n, Y->cmap->n);
1064f4747e26SJunchao Zhang     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1065f4747e26SJunchao Zhang 
1066f4747e26SJunchao Zhang     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1067f4747e26SJunchao Zhang     const auto &Aa     = aijkok->a_dual.view_device();
1068f4747e26SJunchao Zhang     const auto &Adiag  = aijkok->diag_dual.view_device();
1069f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(
1070d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1071f4747e26SJunchao Zhang         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1072f4747e26SJunchao Zhang         else Aa(Adiag(i)) += dv(i);
1073f4747e26SJunchao Zhang       }));
1074f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(D, &dv));
1075f4747e26SJunchao Zhang     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1076f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(n));
1077f4747e26SJunchao Zhang     PetscCall(PetscLogGpuTimeEnd());
1078f4747e26SJunchao Zhang   } else { // need reassembly, very slow!
1079f4747e26SJunchao Zhang     PetscCall(MatDiagonalSet_Default(Y, D, is));
1080f4747e26SJunchao Zhang   }
1081f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1082f4747e26SJunchao Zhang }
1083f4747e26SJunchao Zhang 
1084f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1085f4747e26SJunchao Zhang {
1086f4747e26SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1087f4747e26SJunchao Zhang   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1088f4747e26SJunchao Zhang   ConstPetscScalarKokkosView lv, rv;
1089f4747e26SJunchao Zhang 
1090f4747e26SJunchao Zhang   PetscFunctionBegin;
1091f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1092f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1093f4747e26SJunchao Zhang   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1094f4747e26SJunchao Zhang   const auto &Aa     = aijkok->a_dual.view_device();
1095f4747e26SJunchao Zhang   const auto &Ai     = aijkok->i_dual.view_device();
1096f4747e26SJunchao Zhang   const auto &Aj     = aijkok->j_dual.view_device();
1097f4747e26SJunchao Zhang   if (ll) {
1098f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(ll, &m));
1099f4747e26SJunchao Zhang     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1100f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(ll, &lv));
1101f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each row
1102d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1103f4747e26SJunchao Zhang         PetscInt i   = t.league_rank(); // row i
1104f4747e26SJunchao Zhang         PetscInt len = Ai(i + 1) - Ai(i);
1105f4747e26SJunchao Zhang         // scale entries on the row
1106f4747e26SJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1107f4747e26SJunchao Zhang       }));
1108f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(ll, &lv));
1109f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1110f4747e26SJunchao Zhang   }
1111f4747e26SJunchao Zhang   if (rr) {
1112f4747e26SJunchao Zhang     PetscCall(VecGetLocalSize(rr, &n));
1113f4747e26SJunchao Zhang     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1114f4747e26SJunchao Zhang     PetscCall(VecGetKokkosView(rr, &rv));
1115f4747e26SJunchao Zhang     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1116d326c3f1SJunchao Zhang       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1117f4747e26SJunchao Zhang     PetscCall(VecRestoreKokkosView(rr, &lv));
1118f4747e26SJunchao Zhang     PetscCall(PetscLogGpuFlops(nz));
1119f4747e26SJunchao Zhang   }
1120f4747e26SJunchao Zhang   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1121f4747e26SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1122f4747e26SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1123f4747e26SJunchao Zhang }
1124f4747e26SJunchao Zhang 
1125d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1126d71ae5a4SJacob Faibussowitsch {
1127076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1128a587d139SMark 
1129a587d139SMark   PetscFunctionBegin;
1130076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
11312328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
1132d326c3f1SJunchao Zhang     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
11339566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
11342328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
11359566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
11362328674fSJunchao Zhang   }
11373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1138a587d139SMark }
1139a587d139SMark 
1140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1141d71ae5a4SJacob Faibussowitsch {
1142f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
1143f78ce678SMark Adams   PetscInt              n;
1144f78ce678SMark Adams   PetscScalarKokkosView xv;
1145f78ce678SMark Adams 
1146f78ce678SMark Adams   PetscFunctionBegin;
1147f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
1148f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1149f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1150f78ce678SMark Adams 
1151f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1152f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1153f78ce678SMark Adams 
1154f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
1155f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
1156f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
1157f78ce678SMark Adams 
1158f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
11599371c9d4SSatish Balay   Kokkos::parallel_for(
1160d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1161f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1162f78ce678SMark Adams       else xv(i) = 0;
1163f78ce678SMark Adams     });
1164f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
11653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1166f78ce678SMark Adams }
1167f78ce678SMark Adams 
1168db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1169d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1170d71ae5a4SJacob Faibussowitsch {
1171db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1172db78de30SJunchao Zhang 
1173db78de30SJunchao Zhang   PetscFunctionBegin;
1174db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11754f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1176db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1178db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1179076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
11803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1181db78de30SJunchao Zhang }
1182db78de30SJunchao Zhang 
1183d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1184d71ae5a4SJacob Faibussowitsch {
1185db78de30SJunchao Zhang   PetscFunctionBegin;
1186db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11874f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1188db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
11893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1190db78de30SJunchao Zhang }
1191db78de30SJunchao Zhang 
1192d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1193d71ae5a4SJacob Faibussowitsch {
1194db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1195db78de30SJunchao Zhang 
1196db78de30SJunchao Zhang   PetscFunctionBegin;
1197db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
11984f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1199db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1201db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1202076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1204db78de30SJunchao Zhang }
1205db78de30SJunchao Zhang 
1206d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1207d71ae5a4SJacob Faibussowitsch {
1208db78de30SJunchao Zhang   PetscFunctionBegin;
1209db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12104f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1211db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1214db78de30SJunchao Zhang }
1215db78de30SJunchao Zhang 
1216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1217d71ae5a4SJacob Faibussowitsch {
1218db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
1219db78de30SJunchao Zhang 
1220db78de30SJunchao Zhang   PetscFunctionBegin;
1221db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12224f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1223db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1224db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1225076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
12263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1227db78de30SJunchao Zhang }
1228db78de30SJunchao Zhang 
1229d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1230d71ae5a4SJacob Faibussowitsch {
1231db78de30SJunchao Zhang   PetscFunctionBegin;
1232db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
12334f572ea9SToby Isaac   PetscAssertPointer(kv, 2);
1234db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
12359566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
12363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1237db78de30SJunchao Zhang }
1238db78de30SJunchao Zhang 
1239c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1240c0c276a7Ssdargavi {
1241c0c276a7Ssdargavi   Mat_SeqAIJKokkos *akok;
1242c0c276a7Ssdargavi 
1243c0c276a7Ssdargavi   PetscFunctionBegin;
1244ecd797f4SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1245c0c276a7Ssdargavi   PetscCall(MatCreate(comm, A));
1246c0c276a7Ssdargavi   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1247c0c276a7Ssdargavi   PetscFunctionReturn(PETSC_SUCCESS);
1248c0c276a7Ssdargavi }
1249c0c276a7Ssdargavi 
1250c17cf699SJunchao Zhang /* Computes Y += alpha X */
1251d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1252d71ae5a4SJacob Faibussowitsch {
1253a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1254c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1255c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
1256c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
12574df4a32cSJunchao Zhang   auto                     exec = PetscGetKokkosExecutionSpace();
1258a587d139SMark 
1259a587d139SMark   PetscFunctionBegin;
1260c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1261c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
12629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
12639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
12649566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1265db78de30SJunchao Zhang 
1266c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1267a587d139SMark     PetscBool e;
12689566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1269a587d139SMark     if (e) {
12709566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1271c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1272a587d139SMark     }
1273a587d139SMark   }
1274db78de30SJunchao Zhang 
1275c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1276c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1277c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1278c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1279c17cf699SJunchao Zhang   */
1280c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1281c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1282c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1283c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1284c17cf699SJunchao Zhang 
1285c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1286d326c3f1SJunchao Zhang     KokkosBlas::axpy(exec, alpha, Xa, Ya);
12879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1288c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1289c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1290c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1291c17cf699SJunchao Zhang 
12929371c9d4SSatish Balay     Kokkos::parallel_for(
1293d326c3f1SJunchao Zhang       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
12940e3ece09SJunchao Zhang         PetscInt i = t.league_rank(); // row i
12950e3ece09SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() {
12960e3ece09SJunchao Zhang           // Only one thread works in a team
1297c17cf699SJunchao Zhang           PetscInt p, q = Yi(i);
12980e3ece09SJunchao Zhang           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
12990e3ece09SJunchao Zhang             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
13000e3ece09SJunchao Zhang             if (Xj(p) == Yj(q)) {                        // Found it
1301c17cf699SJunchao Zhang               Ya(q) += alpha * Xa(p);
1302c17cf699SJunchao Zhang               q++;
1303a587d139SMark             } else {
13040e3ece09SJunchao Zhang             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
13050e3ece09SJunchao Zhang             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
13060e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
13070e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
13088b8b16f9SJunchao Zhang #else
13090e3ece09SJunchao Zhang               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
13108b8b16f9SJunchao Zhang #endif
1311a587d139SMark             }
1312c17cf699SJunchao Zhang           }
1313c17cf699SJunchao Zhang         });
1314c17cf699SJunchao Zhang       });
13159566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
13160e3ece09SJunchao Zhang   } else { // different nonzero patterns
1317c17cf699SJunchao Zhang     Mat             Z;
1318c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1319c17cf699SJunchao Zhang     KernelHandle    kh;
13200e3ece09SJunchao Zhang     kh.create_spadd_handle(true); // X, Y are sorted
1321c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1322c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1323c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
13249566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
13259566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1326c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1327c17cf699SJunchao Zhang   }
13289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13290e3ece09SJunchao Zhang   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
13303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1331a587d139SMark }
1332a587d139SMark 
13332c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos {
13342c4ab24aSJunchao Zhang   PetscCount           n;
13352c4ab24aSJunchao Zhang   PetscCount           Atot;
13362c4ab24aSJunchao Zhang   PetscInt             nz;
13372c4ab24aSJunchao Zhang   PetscCountKokkosView jmap;
13382c4ab24aSJunchao Zhang   PetscCountKokkosView perm;
13392c4ab24aSJunchao Zhang 
13402c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
13412c4ab24aSJunchao Zhang   {
13422c4ab24aSJunchao Zhang     nz   = coo_h->nz;
13432c4ab24aSJunchao Zhang     n    = coo_h->n;
13442c4ab24aSJunchao Zhang     Atot = coo_h->Atot;
13452c4ab24aSJunchao Zhang     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
13462c4ab24aSJunchao Zhang     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
13472c4ab24aSJunchao Zhang   }
13482c4ab24aSJunchao Zhang };
13492c4ab24aSJunchao Zhang 
135049abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
13512c4ab24aSJunchao Zhang {
13522c4ab24aSJunchao Zhang   PetscFunctionBegin;
135349abdd8aSBarry Smith   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
13542c4ab24aSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
13552c4ab24aSJunchao Zhang }
13562c4ab24aSJunchao Zhang 
1357d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1358d71ae5a4SJacob Faibussowitsch {
135942550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
136042550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
136103e76207SPierre Jolivet   PetscContainer             container_h;
13622c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJ       *coo_h;
13632c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo_d;
136442550becSJunchao Zhang 
136542550becSJunchao Zhang   PetscFunctionBegin;
13669566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1367394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
136842550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1369cbc6b225SStefano Zampini   delete akok;
1370f4747e26SJunchao Zhang   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
13719566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
13722c4ab24aSJunchao Zhang 
13732c4ab24aSJunchao Zhang   // Copy the COO struct to device
13742c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
13752c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
13762c4ab24aSJunchao Zhang   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
13772c4ab24aSJunchao Zhang 
13782c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
137903e76207SPierre Jolivet   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
13803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138142550becSJunchao Zhang }
138242550becSJunchao Zhang 
1383d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1384d71ae5a4SJacob Faibussowitsch {
138542550becSJunchao Zhang   MatScalarKokkosView        Aa;
138642550becSJunchao Zhang   ConstMatScalarKokkosView   kv;
138742550becSJunchao Zhang   PetscMemType               memtype;
13882c4ab24aSJunchao Zhang   PetscContainer             container;
13892c4ab24aSJunchao Zhang   MatCOOStruct_SeqAIJKokkos *coo;
139042550becSJunchao Zhang 
139142550becSJunchao Zhang   PetscFunctionBegin;
13922c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
13932c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
13942c4ab24aSJunchao Zhang 
13952c4ab24aSJunchao Zhang   const auto &n    = coo->n;
13962c4ab24aSJunchao Zhang   const auto &Annz = coo->nz;
13972c4ab24aSJunchao Zhang   const auto &jmap = coo->jmap;
13982c4ab24aSJunchao Zhang   const auto &perm = coo->perm;
13992c4ab24aSJunchao Zhang 
14009566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
140142550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
14022c4ab24aSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
140342550becSJunchao Zhang   } else {
14042c4ab24aSJunchao Zhang     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
140542550becSJunchao Zhang   }
140642550becSJunchao Zhang 
1407c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1408c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
140942550becSJunchao Zhang 
141008bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
14119371c9d4SSatish Balay   Kokkos::parallel_for(
1412d326c3f1SJunchao Zhang     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1413c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1414c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1415c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1416c7b718f4SJunchao Zhang     });
141708bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1418394ed5ebSJunchao Zhang 
14199566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
14209566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
14213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
142242550becSJunchao Zhang }
142342550becSJunchao Zhang 
1424d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1425d71ae5a4SJacob Faibussowitsch {
1426076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1427076ba34aSJunchao Zhang 
14288c3ff71bSJunchao Zhang   PetscFunctionBegin;
1429076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
14306f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
14316f3d89d0SStefano Zampini 
14328c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
14338c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
14348c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1435a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1436f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1437a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1438076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
14398c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
14408c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
14418c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
14428c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
14438c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
14448c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1445076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
14460ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1447152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1448f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1449f4747e26SJunchao Zhang   A->ops->shift                     = MatShift_SeqAIJKokkos;
1450f4747e26SJunchao Zhang   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1451f4747e26SJunchao Zhang   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1452076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1453076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1454076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1455076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1456076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1457076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
14587ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
145942550becSJunchao Zhang 
14609566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
14619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
146257761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE)
146357761e9aSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
146457761e9aSJunchao Zhang #endif
14653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1466076ba34aSJunchao Zhang }
1467076ba34aSJunchao Zhang 
14689d13fa56SJunchao Zhang /*
14699d13fa56SJunchao Zhang    Extract the (prescribled) diagonal blocks of the matrix and then invert them
14709d13fa56SJunchao Zhang 
14719d13fa56SJunchao Zhang   Input Parameters:
14729d13fa56SJunchao Zhang +  A       - the MATSEQAIJKOKKOS matrix
14739d13fa56SJunchao Zhang .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
14749d13fa56SJunchao Zhang .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
14759d13fa56SJunchao Zhang .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
14769d13fa56SJunchao Zhang -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
14779d13fa56SJunchao Zhang 
14789d13fa56SJunchao Zhang   Output Parameter:
14799d13fa56SJunchao Zhang .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
14809d13fa56SJunchao Zhang */
14819d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
14829d13fa56SJunchao Zhang {
14839d13fa56SJunchao Zhang   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
14849d13fa56SJunchao Zhang   PetscInt          nblocks = bs.extent(0) - 1;
14859d13fa56SJunchao Zhang 
14869d13fa56SJunchao Zhang   PetscFunctionBegin;
14879d13fa56SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
14889d13fa56SJunchao Zhang 
14899d13fa56SJunchao Zhang   // Pull out the diagonal blocks of the matrix and then invert the blocks
14909d13fa56SJunchao Zhang   auto Aa    = akok->a_dual.view_device();
14919d13fa56SJunchao Zhang   auto Ai    = akok->i_dual.view_device();
14929d13fa56SJunchao Zhang   auto Aj    = akok->j_dual.view_device();
14939d13fa56SJunchao Zhang   auto Adiag = akok->diag_dual.view_device();
14949d13fa56SJunchao Zhang   // TODO: how to tune the team size?
149545402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
14969d13fa56SJunchao Zhang   auto ts = Kokkos::AUTO();
14979d13fa56SJunchao Zhang #else
14989d13fa56SJunchao Zhang   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
14999d13fa56SJunchao Zhang #endif
15009d13fa56SJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(
1501d326c3f1SJunchao Zhang     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
15029d13fa56SJunchao Zhang       const PetscInt bid    = teamMember.league_rank();                                                   // block id
15039d13fa56SJunchao Zhang       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
15049d13fa56SJunchao Zhang       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
15059d13fa56SJunchao Zhang       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
15069d13fa56SJunchao Zhang       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
15079d13fa56SJunchao Zhang 
15089d13fa56SJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
15099d13fa56SJunchao Zhang         PetscInt i = rstart + r;                                                            // i-th row in A
15109d13fa56SJunchao Zhang 
15119d13fa56SJunchao Zhang         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
15129d13fa56SJunchao Zhang           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
15139d13fa56SJunchao Zhang 
15149d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
15159d13fa56SJunchao Zhang             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
15169d13fa56SJunchao Zhang               B(r, c) = 0.0;
15179d13fa56SJunchao Zhang             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
15189d13fa56SJunchao Zhang               B(r, c) = Aa(first + c);
15199d13fa56SJunchao Zhang             } else { // this entry does not show up in the CSR
15209d13fa56SJunchao Zhang               B(r, c) = 0.0;
15219d13fa56SJunchao Zhang             }
15229d13fa56SJunchao Zhang           }
15239d13fa56SJunchao Zhang         } else { // rare case that the diagonal does not exist
15249d13fa56SJunchao Zhang           const PetscInt begin = Ai(i);
15259d13fa56SJunchao Zhang           const PetscInt end   = Ai(i + 1);
15269d13fa56SJunchao Zhang           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
15279d13fa56SJunchao Zhang           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
15289d13fa56SJunchao Zhang             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
15299d13fa56SJunchao Zhang             else if (Aj(j) >= rstart + m) break;
15309d13fa56SJunchao Zhang           }
15319d13fa56SJunchao Zhang         }
15329d13fa56SJunchao Zhang       });
15339d13fa56SJunchao Zhang 
15349d13fa56SJunchao Zhang       // LU-decompose B (w/o pivoting) and then invert B
15359d13fa56SJunchao Zhang       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
15369d13fa56SJunchao Zhang       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
15379d13fa56SJunchao Zhang     }));
15389d13fa56SJunchao Zhang   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
15399d13fa56SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15409d13fa56SJunchao Zhang }
15419d13fa56SJunchao Zhang 
1542d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1543d71ae5a4SJacob Faibussowitsch {
1544076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1545076ba34aSJunchao Zhang   PetscInt    i, m, n;
15464df4a32cSJunchao Zhang   auto        exec = PetscGetKokkosExecutionSpace();
1547076ba34aSJunchao Zhang 
1548076ba34aSJunchao Zhang   PetscFunctionBegin;
15495f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1550076ba34aSJunchao Zhang 
1551076ba34aSJunchao Zhang   m = akok->nrows();
1552076ba34aSJunchao Zhang   n = akok->ncols();
15539566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
15549566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1555076ba34aSJunchao Zhang 
1556076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
15579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
155857508eceSPierre Jolivet   aseq = (Mat_SeqAIJ *)A->data;
1559076ba34aSJunchao Zhang 
1560f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1561f3d3cd90SZach Atkins   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1562076ba34aSJunchao Zhang 
1563076ba34aSJunchao Zhang   aseq->i       = akok->i_host_data();
1564076ba34aSJunchao Zhang   aseq->j       = akok->j_host_data();
1565076ba34aSJunchao Zhang   aseq->a       = akok->a_host_data();
1566076ba34aSJunchao Zhang   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1567076ba34aSJunchao Zhang   aseq->free_a  = PETSC_FALSE;
1568076ba34aSJunchao Zhang   aseq->free_ij = PETSC_FALSE;
1569076ba34aSJunchao Zhang   aseq->nz      = akok->nnz();
1570076ba34aSJunchao Zhang   aseq->maxnz   = aseq->nz;
1571076ba34aSJunchao Zhang 
15729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
15739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1574ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1575076ba34aSJunchao Zhang 
1576076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1577076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1578ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
15799566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
15809566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
15813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1582076ba34aSJunchao Zhang }
1583076ba34aSJunchao Zhang 
15840e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
15850e3ece09SJunchao Zhang {
15860e3ece09SJunchao Zhang   PetscFunctionBegin;
15870e3ece09SJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncDevice(A));
15880e3ece09SJunchao Zhang   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
15890e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
15900e3ece09SJunchao Zhang }
15910e3ece09SJunchao Zhang 
15920e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
15930e3ece09SJunchao Zhang {
15940e3ece09SJunchao Zhang   Mat_SeqAIJKokkos *akok;
15954d86920dSPierre Jolivet 
15960e3ece09SJunchao Zhang   PetscFunctionBegin;
15970e3ece09SJunchao Zhang   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
15980e3ece09SJunchao Zhang   PetscCall(MatCreate(comm, A));
15990e3ece09SJunchao Zhang   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16000e3ece09SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
16010e3ece09SJunchao Zhang }
16020e3ece09SJunchao Zhang 
1603076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1604076ba34aSJunchao Zhang 
1605076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1606076ba34aSJunchao Zhang  */
1607d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1608d71ae5a4SJacob Faibussowitsch {
1609076ba34aSJunchao Zhang   PetscFunctionBegin;
16109566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16119566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
16123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16138c3ff71bSJunchao Zhang }
16148c3ff71bSJunchao Zhang 
1615152b3e56SJunchao Zhang /*@C
161611a5261eSBarry Smith   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
16178c3ff71bSJunchao Zhang   (the default parallel PETSc format). This matrix will ultimately be handled by
161820f4b53cSBarry Smith   Kokkos for calculations.
16198c3ff71bSJunchao Zhang 
16208c3ff71bSJunchao Zhang   Collective
16218c3ff71bSJunchao Zhang 
16228c3ff71bSJunchao Zhang   Input Parameters:
162311a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
16248c3ff71bSJunchao Zhang . m    - number of rows
16258c3ff71bSJunchao Zhang . n    - number of columns
162620f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
162720f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
16288c3ff71bSJunchao Zhang 
16298c3ff71bSJunchao Zhang   Output Parameter:
16308c3ff71bSJunchao Zhang . A - the matrix
16318c3ff71bSJunchao Zhang 
16322ef1f0ffSBarry Smith   Level: intermediate
16332ef1f0ffSBarry Smith 
16342ef1f0ffSBarry Smith   Notes:
163511a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
16368c3ff71bSJunchao Zhang   MatXXXXSetPreallocation() paradgm instead of this routine directly.
163711a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
16388c3ff71bSJunchao Zhang 
163911a5261eSBarry Smith   The AIJ format, also called
16402ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
16418c3ff71bSJunchao Zhang   storage.  That is, the stored row and column indices can begin at
164220f4b53cSBarry Smith   either one (as in Fortran) or zero.
16438c3ff71bSJunchao Zhang 
16442ef1f0ffSBarry Smith   Specify the preallocated storage with either `nz` or `nnz` (not both).
16452ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
16462ef1f0ffSBarry Smith   allocation.
16478c3ff71bSJunchao Zhang 
1648fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
16498c3ff71bSJunchao Zhang @*/
1650d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1651d71ae5a4SJacob Faibussowitsch {
16528c3ff71bSJunchao Zhang   PetscFunctionBegin;
16539566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
16549566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
16559566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
16569566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
16579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
16583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16598c3ff71bSJunchao Zhang }
1660930e68a5SMark Adams 
1661aac854edSJunchao Zhang // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1662aac854edSJunchao Zhang // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1663aac854edSJunchao Zhang // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1664aac854edSJunchao Zhang // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1665aac854edSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1666d71ae5a4SJacob Faibussowitsch {
166786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1668aac854edSJunchao Zhang   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1669aac854edSJunchao Zhang   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
167086a27549SJunchao Zhang 
167186a27549SJunchao Zhang   PetscFunctionBegin;
1672aac854edSJunchao Zhang   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1673aac854edSJunchao Zhang     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1674aac854edSJunchao Zhang     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
167586a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
167686a27549SJunchao Zhang   }
16773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
167886a27549SJunchao Zhang }
167986a27549SJunchao Zhang 
1680d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1681d71ae5a4SJacob Faibussowitsch {
1682aac854edSJunchao Zhang   const PetscInt              n         = A->rmap->n;
168386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1684aac854edSJunchao Zhang   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1685aac854edSJunchao Zhang   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
168686a27549SJunchao Zhang 
168786a27549SJunchao Zhang   PetscFunctionBegin;
1688aac854edSJunchao Zhang   if (!factors->transpose_updated) {
1689aac854edSJunchao Zhang     if (has_upper) {
1690aac854edSJunchao Zhang       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1691aac854edSJunchao Zhang         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
16927b8d4ba6SJunchao Zhang         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
16937b8d4ba6SJunchao Zhang         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1694aac854edSJunchao Zhang       }
169586a27549SJunchao Zhang 
1696aac854edSJunchao Zhang       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1697aac854edSJunchao Zhang         if (!factors->U) {
1698aac854edSJunchao Zhang           Mat_SeqAIJ *seq;
169986a27549SJunchao Zhang 
1700aac854edSJunchao Zhang           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1701aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
170286a27549SJunchao Zhang 
1703aac854edSJunchao Zhang           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1704aac854edSJunchao Zhang           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1705aac854edSJunchao Zhang           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1706aac854edSJunchao Zhang           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1707aac854edSJunchao Zhang         } else {
1708aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1709aac854edSJunchao Zhang         }
1710aac854edSJunchao Zhang         // Copy Ut from host to device
1711aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1712aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1713aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1714aac854edSJunchao Zhang       } else { // If U was computed on device, we also compute the transpose there
1715aac854edSJunchao Zhang         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1716aac854edSJunchao Zhang         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1717aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1718aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1719aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->aUt_d));
1720aac854edSJunchao Zhang         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1721aac854edSJunchao Zhang       }
1722aac854edSJunchao Zhang       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1723aac854edSJunchao Zhang     }
1724aac854edSJunchao Zhang 
1725aac854edSJunchao Zhang     // do the same for L with LU
1726aac854edSJunchao Zhang     if (has_lower) {
1727aac854edSJunchao Zhang       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1728aac854edSJunchao Zhang         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1729aac854edSJunchao Zhang         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1730aac854edSJunchao Zhang         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1731aac854edSJunchao Zhang       }
1732aac854edSJunchao Zhang 
1733aac854edSJunchao Zhang       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1734aac854edSJunchao Zhang         if (!factors->L) {
1735aac854edSJunchao Zhang           Mat_SeqAIJ *seq;
1736aac854edSJunchao Zhang 
1737aac854edSJunchao Zhang           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1738aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1739aac854edSJunchao Zhang 
1740aac854edSJunchao Zhang           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1741aac854edSJunchao Zhang           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1742aac854edSJunchao Zhang           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1743aac854edSJunchao Zhang           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1744aac854edSJunchao Zhang         } else {
1745aac854edSJunchao Zhang           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1746aac854edSJunchao Zhang         }
1747aac854edSJunchao Zhang         // Copy Lt from host to device
1748aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1749aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1750aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1751aac854edSJunchao Zhang       } else { // If L was computed on device, we also compute the transpose there
1752aac854edSJunchao Zhang         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1753aac854edSJunchao Zhang         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1754aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1755aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1756aac854edSJunchao Zhang                                                                                                                                                                                                                                factors->aLt_d));
1757aac854edSJunchao Zhang         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1758aac854edSJunchao Zhang       }
1759aac854edSJunchao Zhang       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1760aac854edSJunchao Zhang     }
1761aac854edSJunchao Zhang 
176286a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
176386a27549SJunchao Zhang   }
17643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
176586a27549SJunchao Zhang }
176686a27549SJunchao Zhang 
1767aac854edSJunchao Zhang // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1768aac854edSJunchao Zhang // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1769aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1770d71ae5a4SJacob Faibussowitsch {
1771aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
177286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1773aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1774aac854edSJunchao Zhang   PetscScalarKokkosView       D       = factors->D_d;
1775aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1776aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1777aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1778aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm  = factors->rowperm;
1779aac854edSJunchao Zhang   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
178086a27549SJunchao Zhang 
178186a27549SJunchao Zhang   PetscFunctionBegin;
17829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1783aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1784aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1785aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1786aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
1787aac854edSJunchao Zhang 
1788aac854edSJunchao Zhang   // Solve U^T Y = B
1789aac854edSJunchao Zhang   if (identity) { // Reorder b with the row permutation
1790aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1791aac854edSJunchao Zhang     Y = factors->workVector;
1792aac854edSJunchao Zhang   } else {
1793aac854edSJunchao Zhang     B = factors->workVector;
1794aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1795aac854edSJunchao Zhang     Y = x;
1796aac854edSJunchao Zhang   }
1797aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1798aac854edSJunchao Zhang 
1799aac854edSJunchao Zhang   // Solve diag(D) Y' = Y.
1800aac854edSJunchao Zhang   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1801aac854edSJunchao Zhang   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1802aac854edSJunchao Zhang 
1803aac854edSJunchao Zhang   // Solve UX = Y
1804aac854edSJunchao Zhang   if (identity) {
1805aac854edSJunchao Zhang     X = x;
1806aac854edSJunchao Zhang   } else {
1807aac854edSJunchao Zhang     X = factors->workVector; // B is not needed anymore
1808aac854edSJunchao Zhang   }
1809aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1810aac854edSJunchao Zhang 
1811aac854edSJunchao Zhang   // Reorder X with the inverse column (row) permutation
1812aac854edSJunchao Zhang   if (!identity) {
1813aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1814aac854edSJunchao Zhang   }
1815aac854edSJunchao Zhang 
1816aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1817aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
18189566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
18193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
182086a27549SJunchao Zhang }
182186a27549SJunchao Zhang 
1822aac854edSJunchao Zhang // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1823aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors.
1824aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1825aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1826d71ae5a4SJacob Faibussowitsch {
1827aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
182886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1829aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1830aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1831aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1832aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1833aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm      = factors->rowperm;
1834aac854edSJunchao Zhang   PetscIntKokkosView         &colperm      = factors->colperm;
1835aac854edSJunchao Zhang   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1836aac854edSJunchao Zhang   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
183786a27549SJunchao Zhang 
183886a27549SJunchao Zhang   PetscFunctionBegin;
18399566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1840aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1841aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1842aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
184386a27549SJunchao Zhang 
1844aac854edSJunchao Zhang   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1845aac854edSJunchao Zhang   if (row_identity) {
1846aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1847aac854edSJunchao Zhang     Y = factors->workVector;
1848aac854edSJunchao Zhang   } else {
1849aac854edSJunchao Zhang     B = factors->workVector;
1850aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1851aac854edSJunchao Zhang     Y = x;
1852aac854edSJunchao Zhang   }
1853aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1854aac854edSJunchao Zhang 
1855aac854edSJunchao Zhang   // Solve U C^- x = Y
1856aac854edSJunchao Zhang   if (col_identity) {
1857aac854edSJunchao Zhang     X = x;
1858aac854edSJunchao Zhang   } else {
1859aac854edSJunchao Zhang     X = factors->workVector;
1860aac854edSJunchao Zhang   }
1861aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1862aac854edSJunchao Zhang 
1863aac854edSJunchao Zhang   // x = C X; Reorder X with the inverse col permutation
1864aac854edSJunchao Zhang   if (!col_identity) {
1865aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1866aac854edSJunchao Zhang   }
1867aac854edSJunchao Zhang 
1868aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1869aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
18709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
18713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
187286a27549SJunchao Zhang }
187386a27549SJunchao Zhang 
1874aac854edSJunchao Zhang // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1875aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors.
1876aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1877aac854edSJunchao Zhang // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1878aac854edSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1879aac854edSJunchao Zhang {
1880aac854edSJunchao Zhang   auto                        exec    = PetscGetKokkosExecutionSpace();
1881aac854edSJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1882aac854edSJunchao Zhang   PetscInt                    m       = A->rmap->n;
1883aac854edSJunchao Zhang   PetscScalarKokkosView       X, Y, B; // alias
1884aac854edSJunchao Zhang   ConstPetscScalarKokkosView  b;
1885aac854edSJunchao Zhang   PetscScalarKokkosView       x;
1886aac854edSJunchao Zhang   PetscIntKokkosView         &rowperm      = factors->rowperm;
1887aac854edSJunchao Zhang   PetscIntKokkosView         &colperm      = factors->colperm;
1888aac854edSJunchao Zhang   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1889aac854edSJunchao Zhang   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1890aac854edSJunchao Zhang 
1891aac854edSJunchao Zhang   PetscFunctionBegin;
1892aac854edSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1893aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1894aac854edSJunchao Zhang   PetscCall(VecGetKokkosView(bb, &b));
1895aac854edSJunchao Zhang   PetscCall(VecGetKokkosViewWrite(xx, &x));
1896aac854edSJunchao Zhang 
1897aac854edSJunchao Zhang   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1898aac854edSJunchao Zhang   if (col_identity) { // Reorder b with the col permutation
1899aac854edSJunchao Zhang     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1900aac854edSJunchao Zhang     Y = factors->workVector;
1901aac854edSJunchao Zhang   } else {
1902aac854edSJunchao Zhang     B = factors->workVector;
1903aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1904aac854edSJunchao Zhang     Y = x;
1905aac854edSJunchao Zhang   }
1906aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1907aac854edSJunchao Zhang 
1908aac854edSJunchao Zhang   // Solve L^T X = Y
1909aac854edSJunchao Zhang   if (row_identity) {
1910aac854edSJunchao Zhang     X = x;
1911aac854edSJunchao Zhang   } else {
1912aac854edSJunchao Zhang     X = factors->workVector;
1913aac854edSJunchao Zhang   }
1914aac854edSJunchao Zhang   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1915aac854edSJunchao Zhang 
1916aac854edSJunchao Zhang   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1917aac854edSJunchao Zhang   if (!row_identity) {
1918aac854edSJunchao Zhang     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1919aac854edSJunchao Zhang   }
1920aac854edSJunchao Zhang 
1921aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosView(bb, &b));
1922aac854edSJunchao Zhang   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1923aac854edSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1924aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1925aac854edSJunchao Zhang }
1926aac854edSJunchao Zhang 
1927aac854edSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1928aac854edSJunchao Zhang {
1929aac854edSJunchao Zhang   PetscFunctionBegin;
1930aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncHost(A));
1931aac854edSJunchao Zhang   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1932aac854edSJunchao Zhang 
1933aac854edSJunchao Zhang   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1934aac854edSJunchao Zhang     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1935aac854edSJunchao Zhang     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1936aac854edSJunchao Zhang     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1937aac854edSJunchao Zhang     const MatScalar            *Ba = b->a;
1938aac854edSJunchao Zhang     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1939aac854edSJunchao Zhang 
1940aac854edSJunchao Zhang     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1941aac854edSJunchao Zhang       // Allocate memory and copy the structure
1942aac854edSJunchao Zhang       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1943aac854edSJunchao Zhang       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1944aac854edSJunchao Zhang       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1945aac854edSJunchao Zhang       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1946aac854edSJunchao Zhang       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1947aac854edSJunchao Zhang       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1948aac854edSJunchao Zhang 
1949aac854edSJunchao Zhang       PetscInt *Li = factors->iL_h.data();
1950aac854edSJunchao Zhang       PetscInt *Lj = factors->jL_h.data();
1951aac854edSJunchao Zhang       PetscInt *Ui = factors->iU_h.data();
1952aac854edSJunchao Zhang       PetscInt *Uj = factors->jU_h.data();
1953aac854edSJunchao Zhang 
1954aac854edSJunchao Zhang       Li[0] = Ui[0] = 0;
1955aac854edSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
1956aac854edSJunchao Zhang         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1957aac854edSJunchao Zhang         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1958aac854edSJunchao Zhang 
1959aac854edSJunchao Zhang         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1960aac854edSJunchao Zhang         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1961aac854edSJunchao Zhang 
1962aac854edSJunchao Zhang         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1963aac854edSJunchao Zhang         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1964aac854edSJunchao Zhang 
1965aac854edSJunchao Zhang         Li[i + 1] = Li[i] + llen + 1;
1966aac854edSJunchao Zhang         Ui[i + 1] = Ui[i] + ulen;
1967aac854edSJunchao Zhang       }
1968aac854edSJunchao Zhang 
1969aac854edSJunchao Zhang       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1970aac854edSJunchao Zhang       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1971aac854edSJunchao Zhang       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1972aac854edSJunchao Zhang       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1973aac854edSJunchao Zhang       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1974aac854edSJunchao Zhang       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1975aac854edSJunchao Zhang 
1976aac854edSJunchao Zhang       // Copy row/col permutation to device
1977aac854edSJunchao Zhang       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1978aac854edSJunchao Zhang       PetscBool row_identity;
1979aac854edSJunchao Zhang       PetscCall(ISIdentity(rowperm, &row_identity));
1980aac854edSJunchao Zhang       if (!row_identity) {
1981aac854edSJunchao Zhang         const PetscInt *ip;
1982aac854edSJunchao Zhang 
1983aac854edSJunchao Zhang         PetscCall(ISGetIndices(rowperm, &ip));
1984aac854edSJunchao Zhang         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1985aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1986aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(rowperm, &ip));
1987aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1988aac854edSJunchao Zhang       }
1989aac854edSJunchao Zhang 
1990aac854edSJunchao Zhang       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1991aac854edSJunchao Zhang       PetscBool col_identity;
1992aac854edSJunchao Zhang       PetscCall(ISIdentity(colperm, &col_identity));
1993aac854edSJunchao Zhang       if (!col_identity) {
1994aac854edSJunchao Zhang         const PetscInt *ip;
1995aac854edSJunchao Zhang 
1996aac854edSJunchao Zhang         PetscCall(ISGetIndices(colperm, &ip));
1997aac854edSJunchao Zhang         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1998aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
1999aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(colperm, &ip));
2000aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2001aac854edSJunchao Zhang       }
2002aac854edSJunchao Zhang 
2003aac854edSJunchao Zhang       /* Create sptrsv handles for L, U and their transpose */
2004aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2005aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2006aac854edSJunchao Zhang #else
2007aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2008aac854edSJunchao Zhang #endif
2009aac854edSJunchao Zhang       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2010aac854edSJunchao Zhang       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2011aac854edSJunchao Zhang       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2012aac854edSJunchao Zhang       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2013aac854edSJunchao Zhang     }
2014aac854edSJunchao Zhang 
2015aac854edSJunchao Zhang     // Copy the value
2016aac854edSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2017aac854edSJunchao Zhang       PetscInt        llen = Bi[i + 1] - Bi[i];
2018aac854edSJunchao Zhang       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2019aac854edSJunchao Zhang       const PetscInt *Li   = factors->iL_h.data();
2020aac854edSJunchao Zhang       const PetscInt *Ui   = factors->iU_h.data();
2021aac854edSJunchao Zhang 
2022aac854edSJunchao Zhang       PetscScalar *La = factors->aL_h.data();
2023aac854edSJunchao Zhang       PetscScalar *Ua = factors->aU_h.data();
2024aac854edSJunchao Zhang 
2025aac854edSJunchao Zhang       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2026aac854edSJunchao Zhang       La[Li[i] + llen] = 1.0;                      // diagonal entry
2027aac854edSJunchao Zhang 
2028aac854edSJunchao Zhang       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2029aac854edSJunchao Zhang       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2030aac854edSJunchao Zhang     }
2031aac854edSJunchao Zhang 
2032aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2033aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2034aac854edSJunchao Zhang     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2035aac854edSJunchao Zhang     factors->transpose_updated         = PETSC_FALSE;
2036aac854edSJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2037aac854edSJunchao Zhang 
2038aac854edSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2039aac854edSJunchao Zhang     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2040aac854edSJunchao Zhang   }
2041aac854edSJunchao Zhang 
2042aac854edSJunchao Zhang   B->ops->matsolve          = NULL;
2043aac854edSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2044aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2045aac854edSJunchao Zhang }
2046aac854edSJunchao Zhang 
2047aac854edSJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2048d71ae5a4SJacob Faibussowitsch {
204986a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
205086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
205186a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
205286a27549SJunchao Zhang 
205386a27549SJunchao Zhang   PetscFunctionBegin;
20549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2055aac854edSJunchao Zhang   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
20569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2057076ba34aSJunchao Zhang 
2058076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
2059076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
2060076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
2061076ba34aSJunchao Zhang 
2062aac854edSJunchao Zhang   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
206386a27549SJunchao Zhang 
206486a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
206586a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
2066aac854edSJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2067aac854edSJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
206886a27549SJunchao Zhang   B->ops->matsolve          = NULL;
206986a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
207086a27549SJunchao Zhang 
207186a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
207286a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
207386a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2074eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
20759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
20763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
207786a27549SJunchao Zhang }
207886a27549SJunchao Zhang 
2079aac854edSJunchao Zhang // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2080aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2081d71ae5a4SJacob Faibussowitsch {
208286a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
208386a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
208486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
208586a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
208686a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
208786a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
208886a27549SJunchao Zhang 
208986a27549SJunchao Zhang   PetscFunctionBegin;
2090aac854edSJunchao Zhang   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
20919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
209286a27549SJunchao Zhang 
209386a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
209486a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2095aac854edSJunchao Zhang   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
209686a27549SJunchao Zhang 
209786a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
209886a27549SJunchao Zhang 
209986a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
210086a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
210186a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
210286a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
210386a27549SJunchao Zhang 
210486a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2105076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
2106076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
2107aac854edSJunchao Zhang   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
210886a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
210986a27549SJunchao Zhang 
211086a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
211186a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
211286a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
211386a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
211486a27549SJunchao Zhang 
211586a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
211686a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
211786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2118aac854edSJunchao Zhang   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
211986a27549SJunchao Zhang #else
2120aac854edSJunchao Zhang   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
212186a27549SJunchao Zhang #endif
212286a27549SJunchao Zhang 
212386a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
212486a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
212586a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
212686a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
212786a27549SJunchao Zhang 
212886a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
21299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
213086a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
213186a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
213286a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
2133a1e4e190SStefano Zampini   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
213486a27549SJunchao Zhang 
2135aac854edSJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
21363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2137930e68a5SMark Adams }
2138930e68a5SMark Adams 
2139aac854edSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2140aac854edSJunchao Zhang {
2141aac854edSJunchao Zhang   PetscFunctionBegin;
2142aac854edSJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2143aac854edSJunchao Zhang   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2144aac854edSJunchao Zhang   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2145aac854edSJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2146aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2147aac854edSJunchao Zhang }
2148aac854edSJunchao Zhang 
2149aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2150aac854edSJunchao Zhang {
2151aac854edSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2152aac854edSJunchao Zhang 
2153aac854edSJunchao Zhang   PetscFunctionBegin;
2154aac854edSJunchao Zhang   if (!info->factoronhost) {
2155aac854edSJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
2156aac854edSJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
2157aac854edSJunchao Zhang   }
2158aac854edSJunchao Zhang 
2159aac854edSJunchao Zhang   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2160aac854edSJunchao Zhang   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2161aac854edSJunchao Zhang 
2162aac854edSJunchao Zhang   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2163aac854edSJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2164aac854edSJunchao Zhang   } else {
2165aac854edSJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2166aac854edSJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2167aac854edSJunchao Zhang   }
2168aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2169aac854edSJunchao Zhang }
2170aac854edSJunchao Zhang 
2171aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2172aac854edSJunchao Zhang {
2173aac854edSJunchao Zhang   PetscFunctionBegin;
2174aac854edSJunchao Zhang   PetscCall(MatSeqAIJKokkosSyncHost(A));
2175aac854edSJunchao Zhang   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2176aac854edSJunchao Zhang 
2177aac854edSJunchao Zhang   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2178aac854edSJunchao Zhang     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2179aac854edSJunchao Zhang     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2180aac854edSJunchao Zhang     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2181aac854edSJunchao Zhang     const MatScalar            *Ba = b->a;
2182aac854edSJunchao Zhang     PetscInt                    m  = B->rmap->n;
2183aac854edSJunchao Zhang 
2184aac854edSJunchao Zhang     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2185aac854edSJunchao Zhang       // Allocate memory and copy the structure
2186aac854edSJunchao Zhang       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2187aac854edSJunchao Zhang       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2188aac854edSJunchao Zhang       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2189aac854edSJunchao Zhang       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2190aac854edSJunchao Zhang       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2191aac854edSJunchao Zhang       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2192aac854edSJunchao Zhang 
2193aac854edSJunchao Zhang       // Build jU_h from the skewed Aj
2194aac854edSJunchao Zhang       PetscInt *Uj = factors->jU_h.data();
2195aac854edSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
2196aac854edSJunchao Zhang         PetscInt ulen = Bi[i + 1] - Bi[i];
2197aac854edSJunchao Zhang         Uj[Bi[i]]     = i;                                              // diagonal entry
2198aac854edSJunchao Zhang         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2199aac854edSJunchao Zhang       }
2200aac854edSJunchao Zhang 
2201aac854edSJunchao Zhang       // Copy iU, jU to device
2202aac854edSJunchao Zhang       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2203aac854edSJunchao Zhang       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2204aac854edSJunchao Zhang 
2205aac854edSJunchao Zhang       // Copy row/col permutation to device
2206aac854edSJunchao Zhang       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2207aac854edSJunchao Zhang       PetscBool row_identity;
2208aac854edSJunchao Zhang       PetscCall(ISIdentity(rowperm, &row_identity));
2209aac854edSJunchao Zhang       if (!row_identity) {
2210aac854edSJunchao Zhang         const PetscInt *ip;
2211aac854edSJunchao Zhang 
2212aac854edSJunchao Zhang         PetscCall(ISGetIndices(rowperm, &ip));
2213aac854edSJunchao Zhang         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2214aac854edSJunchao Zhang         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2215aac854edSJunchao Zhang         PetscCall(ISRestoreIndices(rowperm, &ip));
2216aac854edSJunchao Zhang         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2217aac854edSJunchao Zhang       }
2218aac854edSJunchao Zhang 
2219aac854edSJunchao Zhang       // Create sptrsv handles for U and U^T
2220aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2221aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2222aac854edSJunchao Zhang #else
2223aac854edSJunchao Zhang       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2224aac854edSJunchao Zhang #endif
2225aac854edSJunchao Zhang       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2226aac854edSJunchao Zhang       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2227aac854edSJunchao Zhang     }
2228aac854edSJunchao Zhang     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2229aac854edSJunchao Zhang     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2230aac854edSJunchao Zhang     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2231aac854edSJunchao Zhang 
2232aac854edSJunchao Zhang     // Copy the value
2233aac854edSJunchao Zhang     PetscScalar *Ua = factors->aU_h.data();
2234aac854edSJunchao Zhang     PetscScalar *D  = factors->D_h.data();
2235aac854edSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
2236aac854edSJunchao Zhang       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2237aac854edSJunchao Zhang       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2238aac854edSJunchao Zhang       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2239aac854edSJunchao Zhang     }
2240aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2241aac854edSJunchao Zhang     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2242aac854edSJunchao Zhang 
2243aac854edSJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2244aac854edSJunchao Zhang     factors->transpose_updated         = PETSC_FALSE;
2245aac854edSJunchao Zhang   }
2246aac854edSJunchao Zhang 
2247aac854edSJunchao Zhang   B->ops->matsolve          = NULL;
2248aac854edSJunchao Zhang   B->ops->matsolvetranspose = NULL;
2249aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2250aac854edSJunchao Zhang }
2251aac854edSJunchao Zhang 
2252aac854edSJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2253aac854edSJunchao Zhang {
2254aac854edSJunchao Zhang   PetscFunctionBegin;
2255aac854edSJunchao Zhang   if (info->solveonhost) {
2256aac854edSJunchao Zhang     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2257aac854edSJunchao Zhang     PetscCall(MatSetType(B, MATSEQSBAIJ));
2258aac854edSJunchao Zhang     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2259aac854edSJunchao Zhang   }
2260aac854edSJunchao Zhang 
2261aac854edSJunchao Zhang   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2262aac854edSJunchao Zhang 
2263aac854edSJunchao Zhang   if (!info->solveonhost) {
2264bfe80ac4SPierre Jolivet     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2265aac854edSJunchao Zhang     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2266aac854edSJunchao Zhang     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2267aac854edSJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2268aac854edSJunchao Zhang   }
2269aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2270aac854edSJunchao Zhang }
2271aac854edSJunchao Zhang 
2272aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2273aac854edSJunchao Zhang {
2274aac854edSJunchao Zhang   PetscFunctionBegin;
2275aac854edSJunchao Zhang   if (info->solveonhost) {
2276aac854edSJunchao Zhang     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2277aac854edSJunchao Zhang     PetscCall(MatSetType(B, MATSEQSBAIJ));
2278aac854edSJunchao Zhang     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2279aac854edSJunchao Zhang   }
2280aac854edSJunchao Zhang 
2281aac854edSJunchao Zhang   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2282aac854edSJunchao Zhang 
2283aac854edSJunchao Zhang   if (!info->solveonhost) {
2284bfe80ac4SPierre Jolivet     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2285aac854edSJunchao Zhang     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2286aac854edSJunchao Zhang     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2287aac854edSJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2288aac854edSJunchao Zhang   }
2289aac854edSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2290aac854edSJunchao Zhang }
2291aac854edSJunchao Zhang 
2292aac854edSJunchao Zhang // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2293aac854edSJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2294d71ae5a4SJacob Faibussowitsch {
2295930e68a5SMark Adams   PetscFunctionBegin;
2296930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
22973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2298930e68a5SMark Adams }
2299930e68a5SMark Adams 
2300930e68a5SMark Adams /*MC
230186a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
230211a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2303930e68a5SMark Adams 
2304930e68a5SMark Adams   Level: beginner
2305930e68a5SMark Adams 
23061cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2307930e68a5SMark Adams M*/
230886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2309930e68a5SMark Adams {
2310930e68a5SMark Adams   PetscInt n = A->rmap->n;
2311aac854edSJunchao Zhang   MPI_Comm comm;
2312930e68a5SMark Adams 
2313930e68a5SMark Adams   PetscFunctionBegin;
2314aac854edSJunchao Zhang   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2315aac854edSJunchao Zhang   PetscCall(MatCreate(comm, B));
23169566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
2317aac854edSJunchao Zhang   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318930e68a5SMark Adams   (*B)->factortype = ftype;
23199566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
23209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2321aac854edSJunchao Zhang   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2322aac854edSJunchao Zhang 
2323aac854edSJunchao Zhang   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2324aac854edSJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2325aac854edSJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2326aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2327aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2328aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2329aac854edSJunchao Zhang   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2330aac854edSJunchao Zhang     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2331aac854edSJunchao Zhang     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2332aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2333aac854edSJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2334aac854edSJunchao Zhang   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2335aac854edSJunchao Zhang 
2336aac854edSJunchao Zhang   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2337aac854edSJunchao Zhang   (*B)->canuseordering = PETSC_TRUE;
2338aac854edSJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
23393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2340930e68a5SMark Adams }
23418f7e8f9dSMark Adams 
2342aac854edSJunchao Zhang PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2343d71ae5a4SJacob Faibussowitsch {
234486a27549SJunchao Zhang   PetscFunctionBegin;
23459566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2346aac854edSJunchao Zhang   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
23479566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348aac854edSJunchao Zhang   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
23493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
235086a27549SJunchao Zhang }
235186a27549SJunchao Zhang 
2352076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
2353d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2354d71ae5a4SJacob Faibussowitsch {
235545402d8aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
235645402d8aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
235745402d8aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2358076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
2359076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
2360076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
2361076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2362076ba34aSJunchao Zhang 
2363076ba34aSJunchao Zhang   PetscFunctionBegin;
23649566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2365076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
23669566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
236748a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
23689566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2369076ba34aSJunchao Zhang   }
23703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2371076ba34aSJunchao Zhang }
2372