1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 #define DISABLE_CUSPARSE_DEPRECATED 20 #include <KokkosSparse_spmv.hpp> 21 22 #include <KokkosSparse_spiluk.hpp> 23 #include <KokkosSparse_sptrsv.hpp> 24 #include <KokkosSparse_spgemm.hpp> 25 #include <KokkosSparse_spadd.hpp> 26 #include <KokkosBatched_LU_Decl.hpp> 27 #include <KokkosBatched_InverseLU_Decl.hpp> 28 29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 30 31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32 #include <KokkosSparse_Utils.hpp> 33 using KokkosSparse::sort_crs_matrix; 34 using KokkosSparse::Impl::transpose_matrix; 35 #else 36 #include <KokkosKernels_Sorting.hpp> 37 using KokkosKernels::sort_crs_matrix; 38 using KokkosKernels::Impl::transpose_matrix; 39 #endif 40 41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 42 using KokkosSparse::spiluk_symbolic; 43 using KokkosSparse::spiluk_numeric; 44 using KokkosSparse::sptrsv_symbolic; 45 using KokkosSparse::sptrsv_solve; 46 using KokkosSparse::Experimental::SPTRSVAlgorithm; 47 using KokkosSparse::Experimental::SPILUKAlgorithm; 48 #else 49 using KokkosSparse::Experimental::spiluk_symbolic; 50 using KokkosSparse::Experimental::spiluk_numeric; 51 using KokkosSparse::Experimental::sptrsv_symbolic; 52 using KokkosSparse::Experimental::sptrsv_solve; 53 using KokkosSparse::Experimental::SPTRSVAlgorithm; 54 using KokkosSparse::Experimental::SPILUKAlgorithm; 55 #endif 56 57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 58 59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 60 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 61 In the latter case, it is important to set a_dual's sync state correctly. 62 */ 63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 64 { 65 Mat_SeqAIJ *aijseq; 66 Mat_SeqAIJKokkos *aijkok; 67 68 PetscFunctionBegin; 69 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 70 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 71 72 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 73 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 74 75 /* If aijkok does not exist, we just copy i, j to device. 76 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 77 In both cases, we build a new aijkok structure. 78 */ 79 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 80 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 81 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 82 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 84 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 85 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 86 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 87 aijseq->free_a = PETSC_TRUE; 88 aijseq->free_ij = PETSC_TRUE; 89 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 90 we have the CsrMatrix on device first and then copy to host, followed by 91 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 92 One could improve it by not using NULL garray. 93 */ 94 } 95 delete aijkok; 96 aijkok = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */); 97 A->spptr = aijkok; 98 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 99 const PetscInt *adiag; 100 /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */ 101 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL)); 102 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 103 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 104 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 105 } 106 PetscFunctionReturn(PETSC_SUCCESS); 107 } 108 109 /* Sync CSR data to device if not yet */ 110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 111 { 112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 113 114 PetscFunctionBegin; 115 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 116 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 117 if (aijkok->a_dual.need_sync_device()) { 118 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 119 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 120 aijkok->hermitian_updated = PETSC_FALSE; 121 } 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 /* Mark the CSR data on device as modified */ 126 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 129 130 PetscFunctionBegin; 131 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 132 aijkok->a_dual.clear_sync_state(); 133 aijkok->a_dual.modify_device(); 134 aijkok->transpose_updated = PETSC_FALSE; 135 aijkok->hermitian_updated = PETSC_FALSE; 136 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 141 { 142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 143 auto exec = PetscGetKokkosExecutionSpace(); 144 145 PetscFunctionBegin; 146 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 147 /* We do not expect one needs factors on host */ 148 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 149 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 150 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155 { 156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157 158 PetscFunctionBegin; 159 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 160 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 161 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 162 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 163 */ 164 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 165 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 166 *array = aijkok->a_dual.view_host().data(); 167 } else { /* Happens when calling MatSetValues on a newly created matrix */ 168 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 169 } 170 PetscFunctionReturn(PETSC_SUCCESS); 171 } 172 173 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 176 177 PetscFunctionBegin; 178 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 188 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 189 *array = aijkok->a_dual.view_host().data(); 190 } else { 191 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 192 } 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 197 { 198 PetscFunctionBegin; 199 *array = NULL; 200 PetscFunctionReturn(PETSC_SUCCESS); 201 } 202 203 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 204 { 205 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 206 207 PetscFunctionBegin; 208 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 209 *array = aijkok->a_dual.view_host().data(); 210 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 211 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 212 } 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 217 { 218 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 219 220 PetscFunctionBegin; 221 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 222 aijkok->a_dual.clear_sync_state(); 223 aijkok->a_dual.modify_host(); 224 } 225 PetscFunctionReturn(PETSC_SUCCESS); 226 } 227 228 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 234 235 if (i) *i = aijkok->i_device_data(); 236 if (j) *j = aijkok->j_device_data(); 237 if (a) { 238 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 239 *a = aijkok->a_device_data(); 240 } 241 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 242 PetscFunctionReturn(PETSC_SUCCESS); 243 } 244 245 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 246 { 247 PetscFunctionBegin; 248 *mtype = PETSC_MEMTYPE_KOKKOS; 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 /* 253 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 254 255 Input Parameter: 256 . A - the MATSEQAIJKOKKOS matrix 257 258 Output Parameters: 259 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 260 - T_d - the transpose on device, whose value array is allocated but not initialized 261 */ 262 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 263 { 264 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 265 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 266 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 267 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 268 MatRowMapType *Ti = Ti_h.data(); 269 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 270 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 271 PetscInt *Tj = Tj_h.data(); 272 PetscInt *perm = perm_h.data(); 273 PetscInt *offset; 274 275 PetscFunctionBegin; 276 // Populate Ti 277 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 278 Ti++; 279 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 280 Ti--; 281 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 282 283 // Populate Tj and the permutation array 284 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 285 for (PetscInt i = 0; i < m; i++) { 286 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 287 PetscInt r = Aj[j]; // row r of T 288 PetscInt disp = Ti[r] + offset[r]; 289 290 Tj[disp] = i; // col i of T 291 perm[disp] = j; 292 offset[r]++; 293 } 294 } 295 PetscCall(PetscFree(offset)); 296 297 // Sort each row of T, along with the permutation array 298 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 299 300 // Output perm and T on device 301 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 302 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 303 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 304 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 305 PetscFunctionReturn(PETSC_SUCCESS); 306 } 307 308 // Generate the transpose on device and cache it internally 309 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 310 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 311 { 312 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 313 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 314 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 315 KokkosCsrMatrix &T = akok->csrmatT; 316 317 PetscFunctionBegin; 318 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 319 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 320 321 const auto &Aa = akok->a_dual.view_device(); 322 323 if (A->symmetric == PETSC_BOOL3_TRUE) { 324 *csrmatT = akok->csrmat; 325 } else { 326 // See if we already have a cached transpose and its value is up to date 327 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 328 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 329 const auto &perm = akok->transpose_perm; // get the permutation array 330 auto &Ta = T.values; 331 332 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 333 } 334 } else { // Generate T of size n x m for the first time 335 MatRowMapKokkosView perm; 336 337 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 338 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 339 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 340 } 341 akok->transpose_updated = PETSC_TRUE; 342 *csrmatT = akok->csrmatT; 343 } 344 PetscFunctionReturn(PETSC_SUCCESS); 345 } 346 347 // Generate the Hermitian on device and cache it internally 348 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 349 { 350 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 351 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 352 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 353 KokkosCsrMatrix &T = akok->csrmatH; 354 355 PetscFunctionBegin; 356 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 357 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 358 359 const auto &Aa = akok->a_dual.view_device(); 360 361 if (A->hermitian == PETSC_BOOL3_TRUE) { 362 *csrmatH = akok->csrmat; 363 } else { 364 // See if we already have a cached Hermitian and its value is up to date 365 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 366 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 367 const auto &perm = akok->transpose_perm; // get the permutation array 368 auto &Ta = T.values; 369 370 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 371 } 372 } else { // Generate T of size n x m for the first time 373 MatRowMapKokkosView perm; 374 375 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 376 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 377 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 378 } 379 akok->hermitian_updated = PETSC_TRUE; 380 *csrmatH = akok->csrmatH; 381 } 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 /* y = A x */ 386 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387 { 388 Mat_SeqAIJKokkos *aijkok; 389 ConstPetscScalarKokkosView xv; 390 PetscScalarKokkosView yv; 391 392 PetscFunctionBegin; 393 PetscCall(PetscLogGpuTimeBegin()); 394 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 395 PetscCall(VecGetKokkosView(xx, &xv)); 396 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 397 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 398 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 399 PetscCall(VecRestoreKokkosView(xx, &xv)); 400 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 401 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 402 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 403 PetscCall(PetscLogGpuTimeEnd()); 404 PetscFunctionReturn(PETSC_SUCCESS); 405 } 406 407 /* y = A^T x */ 408 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 409 { 410 Mat_SeqAIJKokkos *aijkok; 411 const char *mode; 412 ConstPetscScalarKokkosView xv; 413 PetscScalarKokkosView yv; 414 KokkosCsrMatrix csrmat; 415 416 PetscFunctionBegin; 417 PetscCall(PetscLogGpuTimeBegin()); 418 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 419 PetscCall(VecGetKokkosView(xx, &xv)); 420 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 421 if (A->form_explicit_transpose) { 422 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 423 mode = "N"; 424 } else { 425 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 426 csrmat = aijkok->csrmat; 427 mode = "T"; 428 } 429 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 430 PetscCall(VecRestoreKokkosView(xx, &xv)); 431 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 432 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 433 PetscCall(PetscLogGpuTimeEnd()); 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 /* y = A^H x */ 438 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 439 { 440 Mat_SeqAIJKokkos *aijkok; 441 const char *mode; 442 ConstPetscScalarKokkosView xv; 443 PetscScalarKokkosView yv; 444 KokkosCsrMatrix csrmat; 445 446 PetscFunctionBegin; 447 PetscCall(PetscLogGpuTimeBegin()); 448 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 449 PetscCall(VecGetKokkosView(xx, &xv)); 450 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 451 if (A->form_explicit_transpose) { 452 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 453 mode = "N"; 454 } else { 455 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 456 csrmat = aijkok->csrmat; 457 mode = "C"; 458 } 459 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 460 PetscCall(VecRestoreKokkosView(xx, &xv)); 461 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 462 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 463 PetscCall(PetscLogGpuTimeEnd()); 464 PetscFunctionReturn(PETSC_SUCCESS); 465 } 466 467 /* z = A x + y */ 468 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 469 { 470 Mat_SeqAIJKokkos *aijkok; 471 ConstPetscScalarKokkosView xv; 472 PetscScalarKokkosView zv; 473 474 PetscFunctionBegin; 475 PetscCall(PetscLogGpuTimeBegin()); 476 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 477 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 478 PetscCall(VecGetKokkosView(xx, &xv)); 479 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 480 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 481 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 482 PetscCall(VecRestoreKokkosView(xx, &xv)); 483 PetscCall(VecRestoreKokkosView(zz, &zv)); 484 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 485 PetscCall(PetscLogGpuTimeEnd()); 486 PetscFunctionReturn(PETSC_SUCCESS); 487 } 488 489 /* z = A^T x + y */ 490 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 491 { 492 Mat_SeqAIJKokkos *aijkok; 493 const char *mode; 494 ConstPetscScalarKokkosView xv; 495 PetscScalarKokkosView zv; 496 KokkosCsrMatrix csrmat; 497 498 PetscFunctionBegin; 499 PetscCall(PetscLogGpuTimeBegin()); 500 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 501 if (zz != yy) PetscCall(VecCopy(yy, zz)); 502 PetscCall(VecGetKokkosView(xx, &xv)); 503 PetscCall(VecGetKokkosView(zz, &zv)); 504 if (A->form_explicit_transpose) { 505 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 506 mode = "N"; 507 } else { 508 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 509 csrmat = aijkok->csrmat; 510 mode = "T"; 511 } 512 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 513 PetscCall(VecRestoreKokkosView(xx, &xv)); 514 PetscCall(VecRestoreKokkosView(zz, &zv)); 515 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 516 PetscCall(PetscLogGpuTimeEnd()); 517 PetscFunctionReturn(PETSC_SUCCESS); 518 } 519 520 /* z = A^H x + y */ 521 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 522 { 523 Mat_SeqAIJKokkos *aijkok; 524 const char *mode; 525 ConstPetscScalarKokkosView xv; 526 PetscScalarKokkosView zv; 527 KokkosCsrMatrix csrmat; 528 529 PetscFunctionBegin; 530 PetscCall(PetscLogGpuTimeBegin()); 531 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 532 if (zz != yy) PetscCall(VecCopy(yy, zz)); 533 PetscCall(VecGetKokkosView(xx, &xv)); 534 PetscCall(VecGetKokkosView(zz, &zv)); 535 if (A->form_explicit_transpose) { 536 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 537 mode = "N"; 538 } else { 539 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 540 csrmat = aijkok->csrmat; 541 mode = "C"; 542 } 543 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 544 PetscCall(VecRestoreKokkosView(xx, &xv)); 545 PetscCall(VecRestoreKokkosView(zz, &zv)); 546 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 547 PetscCall(PetscLogGpuTimeEnd()); 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 552 { 553 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 554 555 PetscFunctionBegin; 556 switch (op) { 557 case MAT_FORM_EXPLICIT_TRANSPOSE: 558 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 559 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 560 A->form_explicit_transpose = flg; 561 break; 562 default: 563 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 564 break; 565 } 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 569 /* Depending on reuse, either build a new mat, or use the existing mat */ 570 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 571 { 572 Mat_SeqAIJ *aseq; 573 574 PetscFunctionBegin; 575 PetscCall(PetscKokkosInitializeCheck()); 576 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 577 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 578 PetscCall(MatSetType(*newmat, mtype)); 579 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 580 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 581 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 582 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 583 PetscCall(PetscFree(A->defaultvectype)); 584 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 585 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 586 PetscCall(MatSetOps_SeqAIJKokkos(A)); 587 aseq = static_cast<Mat_SeqAIJ *>(A->data); 588 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 589 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 590 A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 591 } 592 } 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 597 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 598 */ 599 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 600 { 601 Mat_SeqAIJ *bseq; 602 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 603 Mat mat; 604 605 PetscFunctionBegin; 606 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 607 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 608 mat = *B; 609 if (A->assembled) { 610 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 611 bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 612 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 613 /* Now copy values to B if needed */ 614 if (dupOption == MAT_COPY_VALUES) { 615 if (akok->a_dual.need_sync_device()) { 616 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 617 bkok->a_dual.modify_host(); 618 } else { /* If device has the latest data, we only copy data on device */ 619 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 620 bkok->a_dual.modify_device(); 621 } 622 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 623 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 624 bkok->a_dual.modify_host(); 625 } 626 mat->spptr = bkok; 627 } 628 629 PetscCall(PetscFree(mat->defaultvectype)); 630 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 631 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 632 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 633 PetscFunctionReturn(PETSC_SUCCESS); 634 } 635 636 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 637 { 638 Mat At; 639 KokkosCsrMatrix internT; 640 Mat_SeqAIJKokkos *atkok, *bkok; 641 642 PetscFunctionBegin; 643 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 644 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 645 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 646 /* Deep copy internT, as we want to isolate the internal transpose */ 647 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 648 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 649 if (reuse == MAT_INITIAL_MATRIX) *B = At; 650 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 651 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 652 if ((*B)->assembled) { 653 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 654 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 655 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 656 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 657 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 658 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 659 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 660 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 661 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 662 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 663 } 664 PetscFunctionReturn(PETSC_SUCCESS); 665 } 666 667 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 668 { 669 Mat_SeqAIJKokkos *aijkok; 670 671 PetscFunctionBegin; 672 if (A->factortype == MAT_FACTOR_NONE) { 673 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 674 delete aijkok; 675 } else { 676 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 677 } 678 A->spptr = NULL; 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 681 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 682 #if defined(PETSC_HAVE_HYPRE) 683 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 684 #endif 685 PetscCall(MatDestroy_SeqAIJ(A)); 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688 689 /*MC 690 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 691 692 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 693 694 Options Database Key: 695 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 696 697 Level: beginner 698 699 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 700 M*/ 701 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 702 { 703 PetscFunctionBegin; 704 PetscCall(PetscKokkosInitializeCheck()); 705 PetscCall(MatCreate_SeqAIJ(A)); 706 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 707 PetscFunctionReturn(PETSC_SUCCESS); 708 } 709 710 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 711 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 712 { 713 Mat_SeqAIJ *a, *b; 714 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 715 MatScalarKokkosView aa, ba, ca; 716 MatRowMapKokkosView ai, bi, ci; 717 MatColIdxKokkosView aj, bj, cj; 718 PetscInt m, n, nnz, aN; 719 720 PetscFunctionBegin; 721 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 722 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 723 PetscAssertPointer(C, 4); 724 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 725 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 726 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 727 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 728 729 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 730 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 731 a = static_cast<Mat_SeqAIJ *>(A->data); 732 b = static_cast<Mat_SeqAIJ *>(B->data); 733 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 734 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 735 aa = akok->a_dual.view_device(); 736 ai = akok->i_dual.view_device(); 737 ba = bkok->a_dual.view_device(); 738 bi = bkok->i_dual.view_device(); 739 m = A->rmap->n; /* M, N and nnz of C */ 740 n = A->cmap->n + B->cmap->n; 741 nnz = a->nz + b->nz; 742 aN = A->cmap->n; /* N of A */ 743 if (reuse == MAT_INITIAL_MATRIX) { 744 aj = akok->j_dual.view_device(); 745 bj = bkok->j_dual.view_device(); 746 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 747 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 748 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 749 750 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 751 Kokkos::parallel_for( 752 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 753 PetscInt i = t.league_rank(); /* row i */ 754 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 755 756 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 757 ci(i) = coffset; 758 if (i == m - 1) ci(m) = ai(m) + bi(m); 759 }); 760 761 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 762 if (k < alen) { 763 ca(coffset + k) = aa(ai(i) + k); 764 cj(coffset + k) = aj(ai(i) + k); 765 } else { 766 ca(coffset + k) = ba(bi(i) + k - alen); 767 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 768 } 769 }); 770 }); 771 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 772 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 773 } else if (reuse == MAT_REUSE_MATRIX) { 774 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 775 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 776 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 777 ca = ckok->a_dual.view_device(); 778 ci = ckok->i_dual.view_device(); 779 780 Kokkos::parallel_for( 781 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 782 PetscInt i = t.league_rank(); /* row i */ 783 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 784 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 785 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 786 else ca(ci(i) + k) = ba(bi(i) + k - alen); 787 }); 788 }); 789 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata) 795 { 796 PetscFunctionBegin; 797 delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata); 798 PetscFunctionReturn(PETSC_SUCCESS); 799 } 800 801 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 802 { 803 Mat_Product *product = C->product; 804 Mat A, B; 805 bool transA, transB; /* use bool, since KK needs this type */ 806 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 807 Mat_SeqAIJ *c; 808 MatProductCtx_SeqAIJKokkos *pdata; 809 KokkosCsrMatrix csrmatA, csrmatB; 810 811 PetscFunctionBegin; 812 MatCheckProduct(C, 1); 813 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 814 pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data); 815 816 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 817 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 818 // we still do numeric. 819 if (pdata->reusesym) { // numeric reuses results from symbolic 820 pdata->reusesym = PETSC_FALSE; 821 PetscFunctionReturn(PETSC_SUCCESS); 822 } 823 824 switch (product->type) { 825 case MATPRODUCT_AB: 826 transA = false; 827 transB = false; 828 break; 829 case MATPRODUCT_AtB: 830 transA = true; 831 transB = false; 832 break; 833 case MATPRODUCT_ABt: 834 transA = false; 835 transB = true; 836 break; 837 default: 838 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 839 } 840 841 A = product->A; 842 B = product->B; 843 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 844 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 845 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 846 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 847 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 848 849 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 850 851 csrmatA = akok->csrmat; 852 csrmatB = bkok->csrmat; 853 854 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 855 if (transA) { 856 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 857 transA = false; 858 } 859 860 if (transB) { 861 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 862 transB = false; 863 } 864 PetscCall(PetscLogGpuTimeBegin()); 865 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 866 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 867 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 868 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 869 #endif 870 871 PetscCall(PetscLogGpuTimeEnd()); 872 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 873 /* shorter version of MatAssemblyEnd_SeqAIJ */ 874 c = (Mat_SeqAIJ *)C->data; 875 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 876 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 877 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 878 c->reallocs = 0; 879 C->info.mallocs = 0; 880 C->info.nz_unneeded = 0; 881 C->assembled = C->was_assembled = PETSC_TRUE; 882 C->num_ass++; 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 887 { 888 Mat_Product *product = C->product; 889 MatProductType ptype; 890 Mat A, B; 891 bool transA, transB; 892 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 893 MatProductCtx_SeqAIJKokkos *pdata; 894 MPI_Comm comm; 895 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 896 897 PetscFunctionBegin; 898 MatCheckProduct(C, 1); 899 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 900 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 901 A = product->A; 902 B = product->B; 903 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 904 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 905 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 906 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 907 csrmatA = akok->csrmat; 908 csrmatB = bkok->csrmat; 909 910 ptype = product->type; 911 // Take advantage of the symmetry if true 912 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 913 ptype = MATPRODUCT_AB; 914 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 915 } 916 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 917 ptype = MATPRODUCT_AB; 918 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 919 } 920 921 switch (ptype) { 922 case MATPRODUCT_AB: 923 transA = false; 924 transB = false; 925 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 926 break; 927 case MATPRODUCT_AtB: 928 transA = true; 929 transB = false; 930 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 931 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 932 break; 933 case MATPRODUCT_ABt: 934 transA = false; 935 transB = true; 936 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 937 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 938 break; 939 default: 940 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 941 } 942 PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos()); 943 pdata->reusesym = product->api_user; 944 945 /* TODO: add command line options to select spgemm algorithms */ 946 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 947 948 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 949 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 950 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 951 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 952 #endif 953 #endif 954 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 955 956 PetscCall(PetscLogGpuTimeBegin()); 957 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 958 if (transA) { 959 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 960 transA = false; 961 } 962 963 if (transB) { 964 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 965 transB = false; 966 } 967 968 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 969 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 970 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 971 calling new Mat_SeqAIJKokkos(). 972 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 973 */ 974 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 975 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 976 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 977 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 978 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 979 #endif 980 PetscCall(PetscLogGpuTimeEnd()); 981 982 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 983 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 984 C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos; 985 PetscFunctionReturn(PETSC_SUCCESS); 986 } 987 988 /* handles sparse matrix matrix ops */ 989 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 990 { 991 Mat_Product *product = mat->product; 992 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 993 994 PetscFunctionBegin; 995 MatCheckProduct(mat, 1); 996 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 997 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 998 if (Biskok && Ciskok) { 999 switch (product->type) { 1000 case MATPRODUCT_AB: 1001 case MATPRODUCT_AtB: 1002 case MATPRODUCT_ABt: 1003 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1004 break; 1005 case MATPRODUCT_PtAP: 1006 case MATPRODUCT_RARt: 1007 case MATPRODUCT_ABC: 1008 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1009 break; 1010 default: 1011 break; 1012 } 1013 } else { /* fallback for AIJ */ 1014 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1015 } 1016 PetscFunctionReturn(PETSC_SUCCESS); 1017 } 1018 1019 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1020 { 1021 Mat_SeqAIJKokkos *aijkok; 1022 1023 PetscFunctionBegin; 1024 PetscCall(PetscLogGpuTimeBegin()); 1025 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1026 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1027 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1028 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1029 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1030 PetscCall(PetscLogGpuTimeEnd()); 1031 PetscFunctionReturn(PETSC_SUCCESS); 1032 } 1033 1034 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1035 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1036 { 1037 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1038 1039 PetscFunctionBegin; 1040 if (A->assembled && aijseq->diagDense) { // no missing diagonals 1041 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1042 1043 PetscCall(PetscLogGpuTimeBegin()); 1044 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046 const auto &Aa = aijkok->a_dual.view_device(); 1047 const auto &Adiag = aijkok->diag_dual.view_device(); 1048 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1049 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1050 PetscCall(PetscLogGpuFlops(n)); 1051 PetscCall(PetscLogGpuTimeEnd()); 1052 } else { // need reassembly, very slow! 1053 PetscCall(MatShift_Basic(A, a)); 1054 } 1055 PetscFunctionReturn(PETSC_SUCCESS); 1056 } 1057 1058 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1059 { 1060 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1061 1062 PetscFunctionBegin; 1063 if (Y->assembled && aijseq->diagDense) { // no missing diagonals 1064 ConstPetscScalarKokkosView dv; 1065 PetscInt n, nv; 1066 1067 PetscCall(PetscLogGpuTimeBegin()); 1068 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1069 PetscCall(VecGetKokkosView(D, &dv)); 1070 PetscCall(VecGetLocalSize(D, &nv)); 1071 n = PetscMin(Y->rmap->n, Y->cmap->n); 1072 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1073 1074 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1075 const auto &Aa = aijkok->a_dual.view_device(); 1076 const auto &Adiag = aijkok->diag_dual.view_device(); 1077 PetscCallCXX(Kokkos::parallel_for( 1078 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1079 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1080 else Aa(Adiag(i)) += dv(i); 1081 })); 1082 PetscCall(VecRestoreKokkosView(D, &dv)); 1083 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1084 PetscCall(PetscLogGpuFlops(n)); 1085 PetscCall(PetscLogGpuTimeEnd()); 1086 } else { // need reassembly, very slow! 1087 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1088 } 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1093 { 1094 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1095 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1096 ConstPetscScalarKokkosView lv, rv; 1097 1098 PetscFunctionBegin; 1099 PetscCall(PetscLogGpuTimeBegin()); 1100 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1101 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1102 const auto &Aa = aijkok->a_dual.view_device(); 1103 const auto &Ai = aijkok->i_dual.view_device(); 1104 const auto &Aj = aijkok->j_dual.view_device(); 1105 if (ll) { 1106 PetscCall(VecGetLocalSize(ll, &m)); 1107 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1108 PetscCall(VecGetKokkosView(ll, &lv)); 1109 PetscCallCXX(Kokkos::parallel_for( // for each row 1110 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1111 PetscInt i = t.league_rank(); // row i 1112 PetscInt len = Ai(i + 1) - Ai(i); 1113 // scale entries on the row 1114 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1115 })); 1116 PetscCall(VecRestoreKokkosView(ll, &lv)); 1117 PetscCall(PetscLogGpuFlops(nz)); 1118 } 1119 if (rr) { 1120 PetscCall(VecGetLocalSize(rr, &n)); 1121 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1122 PetscCall(VecGetKokkosView(rr, &rv)); 1123 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1124 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1125 PetscCall(VecRestoreKokkosView(rr, &lv)); 1126 PetscCall(PetscLogGpuFlops(nz)); 1127 } 1128 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1129 PetscCall(PetscLogGpuTimeEnd()); 1130 PetscFunctionReturn(PETSC_SUCCESS); 1131 } 1132 1133 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1134 { 1135 Mat_SeqAIJKokkos *aijkok; 1136 1137 PetscFunctionBegin; 1138 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1139 if (aijkok) { /* Only zero the device if data is already there */ 1140 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1141 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1142 } else { /* Might be preallocated but not assembled */ 1143 PetscCall(MatZeroEntries_SeqAIJ(A)); 1144 } 1145 PetscFunctionReturn(PETSC_SUCCESS); 1146 } 1147 1148 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1149 { 1150 Mat_SeqAIJKokkos *aijkok; 1151 PetscInt n; 1152 PetscScalarKokkosView xv; 1153 1154 PetscFunctionBegin; 1155 PetscCall(VecGetLocalSize(x, &n)); 1156 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1157 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1158 1159 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1160 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1161 1162 const auto &Aa = aijkok->a_dual.view_device(); 1163 const auto &Ai = aijkok->i_dual.view_device(); 1164 const auto &Adiag = aijkok->diag_dual.view_device(); 1165 1166 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1167 Kokkos::parallel_for( 1168 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1169 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1170 else xv(i) = 0; 1171 }); 1172 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1173 PetscFunctionReturn(PETSC_SUCCESS); 1174 } 1175 1176 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1177 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1178 { 1179 Mat_SeqAIJKokkos *aijkok; 1180 1181 PetscFunctionBegin; 1182 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1183 PetscAssertPointer(kv, 2); 1184 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1185 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1186 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1187 *kv = aijkok->a_dual.view_device(); 1188 PetscFunctionReturn(PETSC_SUCCESS); 1189 } 1190 1191 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1192 { 1193 PetscFunctionBegin; 1194 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1195 PetscAssertPointer(kv, 2); 1196 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1197 PetscFunctionReturn(PETSC_SUCCESS); 1198 } 1199 1200 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1201 { 1202 Mat_SeqAIJKokkos *aijkok; 1203 1204 PetscFunctionBegin; 1205 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1206 PetscAssertPointer(kv, 2); 1207 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1208 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1209 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1210 *kv = aijkok->a_dual.view_device(); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1215 { 1216 PetscFunctionBegin; 1217 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1218 PetscAssertPointer(kv, 2); 1219 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1220 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1225 { 1226 Mat_SeqAIJKokkos *aijkok; 1227 1228 PetscFunctionBegin; 1229 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1230 PetscAssertPointer(kv, 2); 1231 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1232 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1233 *kv = aijkok->a_dual.view_device(); 1234 PetscFunctionReturn(PETSC_SUCCESS); 1235 } 1236 1237 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1238 { 1239 PetscFunctionBegin; 1240 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1241 PetscAssertPointer(kv, 2); 1242 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1243 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1244 PetscFunctionReturn(PETSC_SUCCESS); 1245 } 1246 1247 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1248 { 1249 Mat_SeqAIJKokkos *akok; 1250 1251 PetscFunctionBegin; 1252 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1253 PetscCall(MatCreate(comm, A)); 1254 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1255 PetscFunctionReturn(PETSC_SUCCESS); 1256 } 1257 1258 /* Computes Y += alpha X */ 1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1260 { 1261 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1262 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1263 ConstMatScalarKokkosView Xa; 1264 MatScalarKokkosView Ya; 1265 auto exec = PetscGetKokkosExecutionSpace(); 1266 1267 PetscFunctionBegin; 1268 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1269 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1270 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1271 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1272 PetscCall(PetscLogGpuTimeBegin()); 1273 1274 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1275 PetscBool e; 1276 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1277 if (e) { 1278 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1279 if (e) pattern = SAME_NONZERO_PATTERN; 1280 } 1281 } 1282 1283 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1284 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1285 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1286 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1287 */ 1288 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1289 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1290 Xa = xkok->a_dual.view_device(); 1291 Ya = ykok->a_dual.view_device(); 1292 1293 if (pattern == SAME_NONZERO_PATTERN) { 1294 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1295 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1296 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1297 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1298 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1299 1300 Kokkos::parallel_for( 1301 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1302 PetscInt i = t.league_rank(); // row i 1303 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1304 // Only one thread works in a team 1305 PetscInt p, q = Yi(i); 1306 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1307 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1308 if (Xj(p) == Yj(q)) { // Found it 1309 Ya(q) += alpha * Xa(p); 1310 q++; 1311 } else { 1312 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1313 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1314 #if PETSC_PKG_KOKKOS_VERSION_GE(5, 0, 0) 1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = KokkosKernels::ArithTraits<PetscScalar>::nan(); 1316 #elif PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1317 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1318 #else 1319 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1320 #endif 1321 } 1322 } 1323 }); 1324 }); 1325 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1326 } else { // different nonzero patterns 1327 Mat Z; 1328 KokkosCsrMatrix zcsr; 1329 KernelHandle kh; 1330 kh.create_spadd_handle(true); // X, Y are sorted 1331 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1332 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1333 zkok = new Mat_SeqAIJKokkos(zcsr); 1334 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1335 PetscCall(MatHeaderReplace(Y, &Z)); 1336 kh.destroy_spadd_handle(); 1337 } 1338 PetscCall(PetscLogGpuTimeEnd()); 1339 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1340 PetscFunctionReturn(PETSC_SUCCESS); 1341 } 1342 1343 struct MatCOOStruct_SeqAIJKokkos { 1344 PetscCount n; 1345 PetscCount Atot; 1346 PetscInt nz; 1347 PetscCountKokkosView jmap; 1348 PetscCountKokkosView perm; 1349 1350 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1351 { 1352 nz = coo_h->nz; 1353 n = coo_h->n; 1354 Atot = coo_h->Atot; 1355 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1356 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1357 } 1358 }; 1359 1360 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1361 { 1362 PetscFunctionBegin; 1363 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1364 PetscFunctionReturn(PETSC_SUCCESS); 1365 } 1366 1367 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1368 { 1369 Mat_SeqAIJKokkos *akok; 1370 Mat_SeqAIJ *aseq; 1371 PetscContainer container_h; 1372 MatCOOStruct_SeqAIJ *coo_h; 1373 MatCOOStruct_SeqAIJKokkos *coo_d; 1374 1375 PetscFunctionBegin; 1376 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1377 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1378 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1379 delete akok; 1380 mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1381 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1382 1383 // Copy the COO struct to device 1384 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1385 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1386 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1387 1388 // Put the COO struct in a container and then attach that to the matrix 1389 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1390 PetscFunctionReturn(PETSC_SUCCESS); 1391 } 1392 1393 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1394 { 1395 MatScalarKokkosView Aa; 1396 ConstMatScalarKokkosView kv; 1397 PetscMemType memtype; 1398 PetscContainer container; 1399 MatCOOStruct_SeqAIJKokkos *coo; 1400 1401 PetscFunctionBegin; 1402 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1403 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1404 1405 const auto &n = coo->n; 1406 const auto &Annz = coo->nz; 1407 const auto &jmap = coo->jmap; 1408 const auto &perm = coo->perm; 1409 1410 PetscCall(PetscGetMemType(v, &memtype)); 1411 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1412 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1413 } else { 1414 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1415 } 1416 1417 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1418 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1419 1420 PetscCall(PetscLogGpuTimeBegin()); 1421 Kokkos::parallel_for( 1422 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1423 PetscScalar sum = 0.0; 1424 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1425 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1426 }); 1427 PetscCall(PetscLogGpuTimeEnd()); 1428 1429 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1430 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1431 PetscFunctionReturn(PETSC_SUCCESS); 1432 } 1433 1434 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1435 { 1436 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1437 1438 PetscFunctionBegin; 1439 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1440 A->boundtocpu = PETSC_FALSE; 1441 1442 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1443 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1444 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1445 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1446 A->ops->scale = MatScale_SeqAIJKokkos; 1447 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1448 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1449 A->ops->mult = MatMult_SeqAIJKokkos; 1450 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1451 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1452 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1453 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1454 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1455 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1456 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1457 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1458 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1459 A->ops->shift = MatShift_SeqAIJKokkos; 1460 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1461 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1462 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1463 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1464 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1465 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1466 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1467 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1468 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1469 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1470 1471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1472 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1473 #if defined(PETSC_HAVE_HYPRE) 1474 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1475 #endif 1476 PetscFunctionReturn(PETSC_SUCCESS); 1477 } 1478 1479 /* 1480 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1481 1482 Input Parameters: 1483 + A - the MATSEQAIJKOKKOS matrix 1484 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1485 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1486 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1487 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1488 1489 Output Parameter: 1490 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1491 */ 1492 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1493 { 1494 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1495 PetscInt nblocks = bs.extent(0) - 1; 1496 1497 PetscFunctionBegin; 1498 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1499 1500 // Pull out the diagonal blocks of the matrix and then invert the blocks 1501 auto Aa = akok->a_dual.view_device(); 1502 auto Ai = akok->i_dual.view_device(); 1503 auto Aj = akok->j_dual.view_device(); 1504 auto Adiag = akok->diag_dual.view_device(); 1505 // TODO: how to tune the team size? 1506 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1507 auto ts = Kokkos::AUTO(); 1508 #else 1509 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1510 #endif 1511 PetscCallCXX(Kokkos::parallel_for( 1512 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1513 const PetscInt bid = teamMember.league_rank(); // block id 1514 const PetscInt rstart = bs(bid); // this block starts from this row 1515 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1516 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1517 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1518 1519 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1520 PetscInt i = rstart + r; // i-th row in A 1521 1522 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1523 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1524 1525 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1526 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1527 B(r, c) = 0.0; 1528 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1529 B(r, c) = Aa(first + c); 1530 } else { // this entry does not show up in the CSR 1531 B(r, c) = 0.0; 1532 } 1533 } 1534 } else { // rare case that the diagonal does not exist 1535 const PetscInt begin = Ai(i); 1536 const PetscInt end = Ai(i + 1); 1537 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1538 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1539 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1540 else if (Aj(j) >= rstart + m) break; 1541 } 1542 } 1543 }); 1544 1545 // LU-decompose B (w/o pivoting) and then invert B 1546 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1547 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1548 })); 1549 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1550 PetscFunctionReturn(PETSC_SUCCESS); 1551 } 1552 1553 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1554 { 1555 Mat_SeqAIJ *aseq; 1556 PetscInt i, m, n; 1557 auto exec = PetscGetKokkosExecutionSpace(); 1558 1559 PetscFunctionBegin; 1560 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1561 1562 m = akok->nrows(); 1563 n = akok->ncols(); 1564 PetscCall(MatSetSizes(A, m, n, m, n)); 1565 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1566 1567 /* Set up data structures of A as a MATSEQAIJ */ 1568 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1569 aseq = (Mat_SeqAIJ *)A->data; 1570 1571 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1572 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1573 1574 aseq->i = akok->i_host_data(); 1575 aseq->j = akok->j_host_data(); 1576 aseq->a = akok->a_host_data(); 1577 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1578 aseq->free_a = PETSC_FALSE; 1579 aseq->free_ij = PETSC_FALSE; 1580 aseq->nz = akok->nnz(); 1581 aseq->maxnz = aseq->nz; 1582 1583 PetscCall(PetscMalloc1(m, &aseq->imax)); 1584 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1585 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1586 1587 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1588 akok->nonzerostate = A->nonzerostate; 1589 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1590 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1591 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1592 PetscFunctionReturn(PETSC_SUCCESS); 1593 } 1594 1595 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1596 { 1597 PetscFunctionBegin; 1598 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1599 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1600 PetscFunctionReturn(PETSC_SUCCESS); 1601 } 1602 1603 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1604 { 1605 Mat_SeqAIJKokkos *akok; 1606 1607 PetscFunctionBegin; 1608 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1609 PetscCall(MatCreate(comm, A)); 1610 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1611 PetscFunctionReturn(PETSC_SUCCESS); 1612 } 1613 1614 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1615 1616 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1617 */ 1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1619 { 1620 PetscFunctionBegin; 1621 PetscCall(MatCreate(comm, A)); 1622 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 /*@C 1627 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1628 (the default parallel PETSc format). This matrix will ultimately be handled by 1629 Kokkos for calculations. 1630 1631 Collective 1632 1633 Input Parameters: 1634 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1635 . m - number of rows 1636 . n - number of columns 1637 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1638 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1639 1640 Output Parameter: 1641 . A - the matrix 1642 1643 Level: intermediate 1644 1645 Notes: 1646 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1647 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1648 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1649 1650 The AIJ format, also called 1651 compressed row storage, is fully compatible with standard Fortran 1652 storage. That is, the stored row and column indices can begin at 1653 either one (as in Fortran) or zero. 1654 1655 Specify the preallocated storage with either `nz` or `nnz` (not both). 1656 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1657 allocation. 1658 1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1660 @*/ 1661 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1662 { 1663 PetscFunctionBegin; 1664 PetscCall(PetscKokkosInitializeCheck()); 1665 PetscCall(MatCreate(comm, A)); 1666 PetscCall(MatSetSizes(*A, m, n, m, n)); 1667 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1668 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1669 PetscFunctionReturn(PETSC_SUCCESS); 1670 } 1671 1672 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1673 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1674 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1675 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1676 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1677 { 1678 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1679 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1680 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1681 1682 PetscFunctionBegin; 1683 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1684 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1685 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1686 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1687 } 1688 PetscFunctionReturn(PETSC_SUCCESS); 1689 } 1690 1691 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1692 { 1693 const PetscInt n = A->rmap->n; 1694 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1695 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1696 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1697 1698 PetscFunctionBegin; 1699 if (!factors->transpose_updated) { 1700 if (has_upper) { 1701 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1702 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1703 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1704 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1705 } 1706 1707 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1708 if (!factors->U) { 1709 Mat_SeqAIJ *seq; 1710 1711 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1712 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1713 1714 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1715 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1716 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1717 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1718 } else { 1719 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1720 } 1721 // Copy Ut from host to device 1722 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1723 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1724 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1725 } else { // If U was computed on device, we also compute the transpose there 1726 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1727 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1728 factors->jU_d, factors->aU_d, 1729 factors->iUt_d, factors->jUt_d, 1730 factors->aUt_d)); 1731 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1732 } 1733 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1734 } 1735 1736 // do the same for L with LU 1737 if (has_lower) { 1738 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1739 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1740 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1741 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1742 } 1743 1744 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1745 if (!factors->L) { 1746 Mat_SeqAIJ *seq; 1747 1748 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1749 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1750 1751 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1752 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1753 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1754 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1755 } else { 1756 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1757 } 1758 // Copy Lt from host to device 1759 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1760 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1761 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1762 } else { // If L was computed on device, we also compute the transpose there 1763 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1764 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1765 factors->jL_d, factors->aL_d, 1766 factors->iLt_d, factors->jLt_d, 1767 factors->aLt_d)); 1768 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1769 } 1770 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1771 } 1772 1773 factors->transpose_updated = PETSC_TRUE; 1774 } 1775 PetscFunctionReturn(PETSC_SUCCESS); 1776 } 1777 1778 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1779 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1780 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1781 { 1782 auto exec = PetscGetKokkosExecutionSpace(); 1783 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1784 PetscInt m = A->rmap->n; 1785 PetscScalarKokkosView D = factors->D_d; 1786 PetscScalarKokkosView X, Y, B; // alias 1787 ConstPetscScalarKokkosView b; 1788 PetscScalarKokkosView x; 1789 PetscIntKokkosView &rowperm = factors->rowperm; 1790 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1791 1792 PetscFunctionBegin; 1793 PetscCall(PetscLogGpuTimeBegin()); 1794 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1795 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1796 PetscCall(VecGetKokkosView(bb, &b)); 1797 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1798 1799 // Solve U^T Y = B 1800 if (identity) { // Reorder b with the row permutation 1801 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1802 Y = factors->workVector; 1803 } else { 1804 B = factors->workVector; 1805 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1806 Y = x; 1807 } 1808 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1809 1810 // Solve diag(D) Y' = Y. 1811 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1812 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1813 1814 // Solve UX = Y 1815 if (identity) { 1816 X = x; 1817 } else { 1818 X = factors->workVector; // B is not needed anymore 1819 } 1820 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1821 1822 // Reorder X with the inverse column (row) permutation 1823 if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1824 1825 PetscCall(VecRestoreKokkosView(bb, &b)); 1826 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1827 PetscCall(PetscLogGpuTimeEnd()); 1828 PetscFunctionReturn(PETSC_SUCCESS); 1829 } 1830 1831 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1832 // R and C are represented by rowperm and colperm in factors. 1833 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1834 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1835 { 1836 auto exec = PetscGetKokkosExecutionSpace(); 1837 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1838 PetscInt m = A->rmap->n; 1839 PetscScalarKokkosView X, Y, B; // alias 1840 ConstPetscScalarKokkosView b; 1841 PetscScalarKokkosView x; 1842 PetscIntKokkosView &rowperm = factors->rowperm; 1843 PetscIntKokkosView &colperm = factors->colperm; 1844 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1845 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1846 1847 PetscFunctionBegin; 1848 PetscCall(PetscLogGpuTimeBegin()); 1849 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1850 PetscCall(VecGetKokkosView(bb, &b)); 1851 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1852 1853 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1854 if (row_identity) { 1855 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1856 Y = factors->workVector; 1857 } else { 1858 B = factors->workVector; 1859 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1860 Y = x; 1861 } 1862 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1863 1864 // Solve U C^- x = Y 1865 if (col_identity) { 1866 X = x; 1867 } else { 1868 X = factors->workVector; 1869 } 1870 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1871 1872 // x = C X; Reorder X with the inverse col permutation 1873 if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1874 1875 PetscCall(VecRestoreKokkosView(bb, &b)); 1876 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1877 PetscCall(PetscLogGpuTimeEnd()); 1878 PetscFunctionReturn(PETSC_SUCCESS); 1879 } 1880 1881 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1882 // R and C are represented by rowperm and colperm in factors. 1883 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1884 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1885 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1886 { 1887 auto exec = PetscGetKokkosExecutionSpace(); 1888 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1889 PetscInt m = A->rmap->n; 1890 PetscScalarKokkosView X, Y, B; // alias 1891 ConstPetscScalarKokkosView b; 1892 PetscScalarKokkosView x; 1893 PetscIntKokkosView &rowperm = factors->rowperm; 1894 PetscIntKokkosView &colperm = factors->colperm; 1895 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1896 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1897 1898 PetscFunctionBegin; 1899 PetscCall(PetscLogGpuTimeBegin()); 1900 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1901 PetscCall(VecGetKokkosView(bb, &b)); 1902 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1903 1904 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1905 if (col_identity) { // Reorder b with the col permutation 1906 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1907 Y = factors->workVector; 1908 } else { 1909 B = factors->workVector; 1910 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1911 Y = x; 1912 } 1913 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1914 1915 // Solve L^T X = Y 1916 if (row_identity) { 1917 X = x; 1918 } else { 1919 X = factors->workVector; 1920 } 1921 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1922 1923 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1924 if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1925 1926 PetscCall(VecRestoreKokkosView(bb, &b)); 1927 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1928 PetscCall(PetscLogGpuTimeEnd()); 1929 PetscFunctionReturn(PETSC_SUCCESS); 1930 } 1931 1932 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1933 { 1934 PetscFunctionBegin; 1935 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1936 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1937 1938 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1939 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1940 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1941 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1942 const MatScalar *Ba = b->a; 1943 PetscInt m = B->rmap->n, n = B->cmap->n; 1944 1945 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1946 // Allocate memory and copy the structure 1947 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1948 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1949 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1950 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1951 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1952 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1953 1954 PetscInt *Li = factors->iL_h.data(); 1955 PetscInt *Lj = factors->jL_h.data(); 1956 PetscInt *Ui = factors->iU_h.data(); 1957 PetscInt *Uj = factors->jU_h.data(); 1958 1959 Li[0] = Ui[0] = 0; 1960 for (PetscInt i = 0; i < m; i++) { 1961 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1962 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1963 1964 PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1965 Lj[Li[i] + llen] = i; // diagonal entry of L 1966 1967 Uj[Ui[i]] = i; // diagonal entry of U 1968 PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1969 1970 Li[i + 1] = Li[i] + llen + 1; 1971 Ui[i + 1] = Ui[i] + ulen; 1972 } 1973 1974 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1975 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1976 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1977 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1978 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1979 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1980 1981 // Copy row/col permutation to device 1982 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1983 PetscBool row_identity; 1984 PetscCall(ISIdentity(rowperm, &row_identity)); 1985 if (!row_identity) { 1986 const PetscInt *ip; 1987 1988 PetscCall(ISGetIndices(rowperm, &ip)); 1989 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1990 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1991 PetscCall(ISRestoreIndices(rowperm, &ip)); 1992 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1993 } 1994 1995 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1996 PetscBool col_identity; 1997 PetscCall(ISIdentity(colperm, &col_identity)); 1998 if (!col_identity) { 1999 const PetscInt *ip; 2000 2001 PetscCall(ISGetIndices(colperm, &ip)); 2002 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2003 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2004 PetscCall(ISRestoreIndices(colperm, &ip)); 2005 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2006 } 2007 2008 /* Create sptrsv handles for L, U and their transpose */ 2009 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2010 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2011 #else 2012 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2013 #endif 2014 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2015 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2016 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2017 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2018 } 2019 2020 // Copy the value 2021 for (PetscInt i = 0; i < m; i++) { 2022 PetscInt llen = Bi[i + 1] - Bi[i]; 2023 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2024 const PetscInt *Li = factors->iL_h.data(); 2025 const PetscInt *Ui = factors->iU_h.data(); 2026 2027 PetscScalar *La = factors->aL_h.data(); 2028 PetscScalar *Ua = factors->aU_h.data(); 2029 2030 PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2031 La[Li[i] + llen] = 1.0; // diagonal entry 2032 2033 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2034 PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2035 } 2036 2037 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2038 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2039 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2040 factors->transpose_updated = PETSC_FALSE; 2041 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2042 2043 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2044 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2045 } 2046 2047 B->ops->matsolve = NULL; 2048 B->ops->matsolvetranspose = NULL; 2049 PetscFunctionReturn(PETSC_SUCCESS); 2050 } 2051 2052 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2053 { 2054 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2055 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2056 PetscInt fill_lev = info->levels; 2057 2058 PetscFunctionBegin; 2059 PetscCall(PetscLogGpuTimeBegin()); 2060 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2061 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2062 2063 auto a_d = aijkok->a_dual.view_device(); 2064 auto i_d = aijkok->i_dual.view_device(); 2065 auto j_d = aijkok->j_dual.view_device(); 2066 2067 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2068 2069 B->assembled = PETSC_TRUE; 2070 B->preallocated = PETSC_TRUE; 2071 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2072 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2073 B->ops->matsolve = NULL; 2074 B->ops->matsolvetranspose = NULL; 2075 2076 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2077 factors->transpose_updated = PETSC_FALSE; 2078 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2079 /* TODO: log flops, but how to know that? */ 2080 PetscCall(PetscLogGpuTimeEnd()); 2081 PetscFunctionReturn(PETSC_SUCCESS); 2082 } 2083 2084 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2085 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2086 { 2087 Mat_SeqAIJKokkos *aijkok; 2088 Mat_SeqAIJ *b; 2089 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2090 PetscInt fill_lev = info->levels; 2091 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2092 PetscInt n = A->rmap->n; 2093 2094 PetscFunctionBegin; 2095 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2096 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2097 2098 /* Create a spiluk handle and then do symbolic factorization */ 2099 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2100 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2101 2102 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2103 2104 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2105 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2106 Kokkos::realloc(factors->iU_d, n + 1); 2107 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2108 2109 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2110 auto i_d = aijkok->i_dual.view_device(); 2111 auto j_d = aijkok->j_dual.view_device(); 2112 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2113 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2114 2115 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2116 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2117 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2118 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2119 2120 /* TODO: add options to select sptrsv algorithms */ 2121 /* Create sptrsv handles for L, U and their transpose */ 2122 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2123 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2124 #else 2125 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2126 #endif 2127 2128 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2129 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2130 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2131 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2132 2133 /* Fill fields of the factor matrix B */ 2134 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2135 b = (Mat_SeqAIJ *)B->data; 2136 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2137 B->info.fill_ratio_given = info->fill; 2138 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2139 2140 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2141 PetscFunctionReturn(PETSC_SUCCESS); 2142 } 2143 2144 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2145 { 2146 PetscFunctionBegin; 2147 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2148 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2149 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2150 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2151 PetscFunctionReturn(PETSC_SUCCESS); 2152 } 2153 2154 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2155 { 2156 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2157 2158 PetscFunctionBegin; 2159 if (!info->factoronhost) { 2160 PetscCall(ISIdentity(isrow, &row_identity)); 2161 PetscCall(ISIdentity(iscol, &col_identity)); 2162 } 2163 2164 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2165 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2166 2167 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2168 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2169 } else { 2170 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2171 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2172 } 2173 PetscFunctionReturn(PETSC_SUCCESS); 2174 } 2175 2176 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2177 { 2178 PetscFunctionBegin; 2179 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2180 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2181 2182 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2183 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2184 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2185 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2186 const MatScalar *Ba = b->a; 2187 PetscInt m = B->rmap->n; 2188 2189 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2190 // Allocate memory and copy the structure 2191 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2192 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2193 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2194 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2195 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2196 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2197 2198 // Build jU_h from the skewed Aj 2199 PetscInt *Uj = factors->jU_h.data(); 2200 for (PetscInt i = 0; i < m; i++) { 2201 PetscInt ulen = Bi[i + 1] - Bi[i]; 2202 Uj[Bi[i]] = i; // diagonal entry 2203 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2204 } 2205 2206 // Copy iU, jU to device 2207 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2208 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2209 2210 // Copy row/col permutation to device 2211 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2212 PetscBool row_identity; 2213 PetscCall(ISIdentity(rowperm, &row_identity)); 2214 if (!row_identity) { 2215 const PetscInt *ip; 2216 2217 PetscCall(ISGetIndices(rowperm, &ip)); 2218 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2219 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2220 PetscCall(ISRestoreIndices(rowperm, &ip)); 2221 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2222 } 2223 2224 // Create sptrsv handles for U and U^T 2225 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2226 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2227 #else 2228 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2229 #endif 2230 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2231 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2232 } 2233 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2234 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2235 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2236 2237 // Copy the value 2238 PetscScalar *Ua = factors->aU_h.data(); 2239 PetscScalar *D = factors->D_h.data(); 2240 for (PetscInt i = 0; i < m; i++) { 2241 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2242 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2243 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2244 } 2245 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2246 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2247 2248 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2249 factors->transpose_updated = PETSC_FALSE; 2250 } 2251 2252 B->ops->matsolve = NULL; 2253 B->ops->matsolvetranspose = NULL; 2254 PetscFunctionReturn(PETSC_SUCCESS); 2255 } 2256 2257 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2258 { 2259 PetscFunctionBegin; 2260 if (info->solveonhost) { 2261 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2262 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2263 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2264 } 2265 2266 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2267 2268 if (!info->solveonhost) { 2269 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2270 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2271 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2272 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2273 } 2274 PetscFunctionReturn(PETSC_SUCCESS); 2275 } 2276 2277 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2278 { 2279 PetscFunctionBegin; 2280 if (info->solveonhost) { 2281 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2282 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2283 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2284 } 2285 2286 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2287 2288 if (!info->solveonhost) { 2289 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2290 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2291 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2292 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2293 } 2294 PetscFunctionReturn(PETSC_SUCCESS); 2295 } 2296 2297 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2298 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2299 { 2300 PetscFunctionBegin; 2301 *type = MATSOLVERKOKKOS; 2302 PetscFunctionReturn(PETSC_SUCCESS); 2303 } 2304 2305 /*MC 2306 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2307 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2308 2309 Level: beginner 2310 2311 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS` 2312 M*/ 2313 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2314 { 2315 PetscInt n = A->rmap->n; 2316 MPI_Comm comm; 2317 2318 PetscFunctionBegin; 2319 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2320 PetscCall(MatCreate(comm, B)); 2321 PetscCall(MatSetSizes(*B, n, n, n, n)); 2322 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2323 (*B)->factortype = ftype; 2324 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2325 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2326 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2327 2328 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2329 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2330 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2331 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2332 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2333 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2334 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2335 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2336 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2337 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2338 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2339 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2340 2341 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2342 (*B)->canuseordering = PETSC_TRUE; 2343 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2344 PetscFunctionReturn(PETSC_SUCCESS); 2345 } 2346 2347 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2348 { 2349 PetscFunctionBegin; 2350 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2351 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2352 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2353 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2354 PetscFunctionReturn(PETSC_SUCCESS); 2355 } 2356 2357 /* Utility to print out a KokkosCsrMatrix for debugging */ 2358 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2359 { 2360 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2361 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2362 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2363 const PetscInt *i = iv.data(); 2364 const PetscInt *j = jv.data(); 2365 const PetscScalar *a = av.data(); 2366 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2367 2368 PetscFunctionBegin; 2369 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2370 for (PetscInt k = 0; k < m; k++) { 2371 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2372 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2373 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2374 } 2375 PetscFunctionReturn(PETSC_SUCCESS); 2376 } 2377