1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 #define DISABLE_CUSPARSE_DEPRECATED 20 #include <KokkosSparse_spmv.hpp> 21 22 #include <KokkosSparse_spiluk.hpp> 23 #include <KokkosSparse_sptrsv.hpp> 24 #include <KokkosSparse_spgemm.hpp> 25 #include <KokkosSparse_spadd.hpp> 26 #include <KokkosBatched_LU_Decl.hpp> 27 #include <KokkosBatched_InverseLU_Decl.hpp> 28 29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 30 31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32 #include <KokkosSparse_Utils.hpp> 33 using KokkosSparse::sort_crs_matrix; 34 using KokkosSparse::Impl::transpose_matrix; 35 #else 36 #include <KokkosKernels_Sorting.hpp> 37 using KokkosKernels::sort_crs_matrix; 38 using KokkosKernels::Impl::transpose_matrix; 39 #endif 40 41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 42 using KokkosSparse::spiluk_symbolic; 43 using KokkosSparse::spiluk_numeric; 44 using KokkosSparse::sptrsv_symbolic; 45 using KokkosSparse::sptrsv_solve; 46 using KokkosSparse::Experimental::SPTRSVAlgorithm; 47 using KokkosSparse::Experimental::SPILUKAlgorithm; 48 #else 49 using KokkosSparse::Experimental::spiluk_symbolic; 50 using KokkosSparse::Experimental::spiluk_numeric; 51 using KokkosSparse::Experimental::sptrsv_symbolic; 52 using KokkosSparse::Experimental::sptrsv_solve; 53 using KokkosSparse::Experimental::SPTRSVAlgorithm; 54 using KokkosSparse::Experimental::SPILUKAlgorithm; 55 #endif 56 57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 58 59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 60 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 61 In the latter case, it is important to set a_dual's sync state correctly. 62 */ 63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 64 { 65 Mat_SeqAIJ *aijseq; 66 Mat_SeqAIJKokkos *aijkok; 67 68 PetscFunctionBegin; 69 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 70 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 71 72 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 73 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 74 75 /* If aijkok does not exist, we just copy i, j to device. 76 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 77 In both cases, we build a new aijkok structure. 78 */ 79 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 80 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 81 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 82 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 84 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 85 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 86 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 87 aijseq->free_a = PETSC_TRUE; 88 aijseq->free_ij = PETSC_TRUE; 89 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 90 we have the CsrMatrix on device first and then copy to host, followed by 91 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 92 One could improve it by not using NULL garray. 93 */ 94 } 95 delete aijkok; 96 aijkok = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */); 97 A->spptr = aijkok; 98 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 99 const PetscInt *adiag; 100 /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */ 101 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL)); 102 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 103 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 104 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 105 } 106 PetscFunctionReturn(PETSC_SUCCESS); 107 } 108 109 /* Sync CSR data to device if not yet */ 110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 111 { 112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 113 114 PetscFunctionBegin; 115 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 116 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 117 if (aijkok->a_dual.need_sync_device()) { 118 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 119 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 120 aijkok->hermitian_updated = PETSC_FALSE; 121 } 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 /* Mark the CSR data on device as modified */ 126 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 129 130 PetscFunctionBegin; 131 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 132 aijkok->a_dual.clear_sync_state(); 133 aijkok->a_dual.modify_device(); 134 aijkok->transpose_updated = PETSC_FALSE; 135 aijkok->hermitian_updated = PETSC_FALSE; 136 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 141 { 142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 143 auto exec = PetscGetKokkosExecutionSpace(); 144 145 PetscFunctionBegin; 146 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 147 /* We do not expect one needs factors on host */ 148 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 149 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 150 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155 { 156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157 158 PetscFunctionBegin; 159 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 160 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 161 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 162 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 163 */ 164 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 165 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 166 *array = aijkok->a_dual.view_host().data(); 167 } else { /* Happens when calling MatSetValues on a newly created matrix */ 168 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 169 } 170 PetscFunctionReturn(PETSC_SUCCESS); 171 } 172 173 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 176 177 PetscFunctionBegin; 178 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 188 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 189 *array = aijkok->a_dual.view_host().data(); 190 } else { 191 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 192 } 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 197 { 198 PetscFunctionBegin; 199 *array = NULL; 200 PetscFunctionReturn(PETSC_SUCCESS); 201 } 202 203 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 204 { 205 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 206 207 PetscFunctionBegin; 208 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 209 *array = aijkok->a_dual.view_host().data(); 210 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 211 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 212 } 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 217 { 218 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 219 220 PetscFunctionBegin; 221 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 222 aijkok->a_dual.clear_sync_state(); 223 aijkok->a_dual.modify_host(); 224 } 225 PetscFunctionReturn(PETSC_SUCCESS); 226 } 227 228 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 234 235 if (i) *i = aijkok->i_device_data(); 236 if (j) *j = aijkok->j_device_data(); 237 if (a) { 238 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 239 *a = aijkok->a_device_data(); 240 } 241 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 242 PetscFunctionReturn(PETSC_SUCCESS); 243 } 244 245 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 246 { 247 PetscFunctionBegin; 248 *mtype = PETSC_MEMTYPE_KOKKOS; 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 /* 253 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 254 255 Input Parameter: 256 . A - the MATSEQAIJKOKKOS matrix 257 258 Output Parameters: 259 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 260 - T_d - the transpose on device, whose value array is allocated but not initialized 261 */ 262 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 263 { 264 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 265 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 266 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 267 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 268 MatRowMapType *Ti = Ti_h.data(); 269 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 270 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 271 PetscInt *Tj = Tj_h.data(); 272 PetscInt *perm = perm_h.data(); 273 PetscInt *offset; 274 275 PetscFunctionBegin; 276 // Populate Ti 277 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 278 Ti++; 279 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 280 Ti--; 281 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 282 283 // Populate Tj and the permutation array 284 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 285 for (PetscInt i = 0; i < m; i++) { 286 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 287 PetscInt r = Aj[j]; // row r of T 288 PetscInt disp = Ti[r] + offset[r]; 289 290 Tj[disp] = i; // col i of T 291 perm[disp] = j; 292 offset[r]++; 293 } 294 } 295 PetscCall(PetscFree(offset)); 296 297 // Sort each row of T, along with the permutation array 298 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 299 300 // Output perm and T on device 301 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 302 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 303 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 304 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 305 PetscFunctionReturn(PETSC_SUCCESS); 306 } 307 308 // Generate the transpose on device and cache it internally 309 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 310 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 311 { 312 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 313 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 314 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 315 KokkosCsrMatrix &T = akok->csrmatT; 316 317 PetscFunctionBegin; 318 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 319 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 320 321 const auto &Aa = akok->a_dual.view_device(); 322 323 if (A->symmetric == PETSC_BOOL3_TRUE) { 324 *csrmatT = akok->csrmat; 325 } else { 326 // See if we already have a cached transpose and its value is up to date 327 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 328 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 329 const auto &perm = akok->transpose_perm; // get the permutation array 330 auto &Ta = T.values; 331 332 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 333 } 334 } else { // Generate T of size n x m for the first time 335 MatRowMapKokkosView perm; 336 337 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 338 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 339 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 340 } 341 akok->transpose_updated = PETSC_TRUE; 342 *csrmatT = akok->csrmatT; 343 } 344 PetscFunctionReturn(PETSC_SUCCESS); 345 } 346 347 // Generate the Hermitian on device and cache it internally 348 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 349 { 350 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 351 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 352 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 353 KokkosCsrMatrix &T = akok->csrmatH; 354 355 PetscFunctionBegin; 356 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 357 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 358 359 const auto &Aa = akok->a_dual.view_device(); 360 361 if (A->hermitian == PETSC_BOOL3_TRUE) { 362 *csrmatH = akok->csrmat; 363 } else { 364 // See if we already have a cached Hermitian and its value is up to date 365 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 366 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 367 const auto &perm = akok->transpose_perm; // get the permutation array 368 auto &Ta = T.values; 369 370 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 371 } 372 } else { // Generate T of size n x m for the first time 373 MatRowMapKokkosView perm; 374 375 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 376 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 377 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 378 } 379 akok->hermitian_updated = PETSC_TRUE; 380 *csrmatH = akok->csrmatH; 381 } 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 /* y = A x */ 386 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387 { 388 Mat_SeqAIJKokkos *aijkok; 389 ConstPetscScalarKokkosView xv; 390 PetscScalarKokkosView yv; 391 392 PetscFunctionBegin; 393 PetscCall(PetscLogGpuTimeBegin()); 394 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 395 PetscCall(VecGetKokkosView(xx, &xv)); 396 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 397 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 398 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 399 PetscCall(VecRestoreKokkosView(xx, &xv)); 400 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 401 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 402 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 403 PetscCall(PetscLogGpuTimeEnd()); 404 PetscFunctionReturn(PETSC_SUCCESS); 405 } 406 407 /* y = A^T x */ 408 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 409 { 410 Mat_SeqAIJKokkos *aijkok; 411 const char *mode; 412 ConstPetscScalarKokkosView xv; 413 PetscScalarKokkosView yv; 414 KokkosCsrMatrix csrmat; 415 416 PetscFunctionBegin; 417 PetscCall(PetscLogGpuTimeBegin()); 418 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 419 PetscCall(VecGetKokkosView(xx, &xv)); 420 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 421 if (A->form_explicit_transpose) { 422 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 423 mode = "N"; 424 } else { 425 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 426 csrmat = aijkok->csrmat; 427 mode = "T"; 428 } 429 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 430 PetscCall(VecRestoreKokkosView(xx, &xv)); 431 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 432 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 433 PetscCall(PetscLogGpuTimeEnd()); 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 /* y = A^H x */ 438 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 439 { 440 Mat_SeqAIJKokkos *aijkok; 441 const char *mode; 442 ConstPetscScalarKokkosView xv; 443 PetscScalarKokkosView yv; 444 KokkosCsrMatrix csrmat; 445 446 PetscFunctionBegin; 447 PetscCall(PetscLogGpuTimeBegin()); 448 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 449 PetscCall(VecGetKokkosView(xx, &xv)); 450 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 451 if (A->form_explicit_transpose) { 452 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 453 mode = "N"; 454 } else { 455 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 456 csrmat = aijkok->csrmat; 457 mode = "C"; 458 } 459 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 460 PetscCall(VecRestoreKokkosView(xx, &xv)); 461 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 462 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 463 PetscCall(PetscLogGpuTimeEnd()); 464 PetscFunctionReturn(PETSC_SUCCESS); 465 } 466 467 /* z = A x + y */ 468 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 469 { 470 Mat_SeqAIJKokkos *aijkok; 471 ConstPetscScalarKokkosView xv; 472 PetscScalarKokkosView zv; 473 474 PetscFunctionBegin; 475 PetscCall(PetscLogGpuTimeBegin()); 476 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 477 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 478 PetscCall(VecGetKokkosView(xx, &xv)); 479 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 480 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 481 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 482 PetscCall(VecRestoreKokkosView(xx, &xv)); 483 PetscCall(VecRestoreKokkosView(zz, &zv)); 484 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 485 PetscCall(PetscLogGpuTimeEnd()); 486 PetscFunctionReturn(PETSC_SUCCESS); 487 } 488 489 /* z = A^T x + y */ 490 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 491 { 492 Mat_SeqAIJKokkos *aijkok; 493 const char *mode; 494 ConstPetscScalarKokkosView xv; 495 PetscScalarKokkosView zv; 496 KokkosCsrMatrix csrmat; 497 498 PetscFunctionBegin; 499 PetscCall(PetscLogGpuTimeBegin()); 500 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 501 if (zz != yy) PetscCall(VecCopy(yy, zz)); 502 PetscCall(VecGetKokkosView(xx, &xv)); 503 PetscCall(VecGetKokkosView(zz, &zv)); 504 if (A->form_explicit_transpose) { 505 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 506 mode = "N"; 507 } else { 508 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 509 csrmat = aijkok->csrmat; 510 mode = "T"; 511 } 512 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 513 PetscCall(VecRestoreKokkosView(xx, &xv)); 514 PetscCall(VecRestoreKokkosView(zz, &zv)); 515 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 516 PetscCall(PetscLogGpuTimeEnd()); 517 PetscFunctionReturn(PETSC_SUCCESS); 518 } 519 520 /* z = A^H x + y */ 521 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 522 { 523 Mat_SeqAIJKokkos *aijkok; 524 const char *mode; 525 ConstPetscScalarKokkosView xv; 526 PetscScalarKokkosView zv; 527 KokkosCsrMatrix csrmat; 528 529 PetscFunctionBegin; 530 PetscCall(PetscLogGpuTimeBegin()); 531 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 532 if (zz != yy) PetscCall(VecCopy(yy, zz)); 533 PetscCall(VecGetKokkosView(xx, &xv)); 534 PetscCall(VecGetKokkosView(zz, &zv)); 535 if (A->form_explicit_transpose) { 536 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 537 mode = "N"; 538 } else { 539 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 540 csrmat = aijkok->csrmat; 541 mode = "C"; 542 } 543 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 544 PetscCall(VecRestoreKokkosView(xx, &xv)); 545 PetscCall(VecRestoreKokkosView(zz, &zv)); 546 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 547 PetscCall(PetscLogGpuTimeEnd()); 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 552 { 553 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 554 555 PetscFunctionBegin; 556 switch (op) { 557 case MAT_FORM_EXPLICIT_TRANSPOSE: 558 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 559 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 560 A->form_explicit_transpose = flg; 561 break; 562 default: 563 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 564 break; 565 } 566 PetscFunctionReturn(PETSC_SUCCESS); 567 } 568 569 /* Depending on reuse, either build a new mat, or use the existing mat */ 570 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 571 { 572 Mat_SeqAIJ *aseq; 573 574 PetscFunctionBegin; 575 PetscCall(PetscKokkosInitializeCheck()); 576 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 577 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 578 PetscCall(MatSetType(*newmat, mtype)); 579 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 580 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 581 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 582 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 583 PetscCall(PetscFree(A->defaultvectype)); 584 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 585 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 586 PetscCall(MatSetOps_SeqAIJKokkos(A)); 587 aseq = static_cast<Mat_SeqAIJ *>(A->data); 588 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 589 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 590 A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 591 } 592 } 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 597 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 598 */ 599 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 600 { 601 Mat_SeqAIJ *bseq; 602 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 603 Mat mat; 604 605 PetscFunctionBegin; 606 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 607 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 608 mat = *B; 609 if (A->assembled) { 610 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 611 bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 612 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 613 /* Now copy values to B if needed */ 614 if (dupOption == MAT_COPY_VALUES) { 615 if (akok->a_dual.need_sync_device()) { 616 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 617 bkok->a_dual.modify_host(); 618 } else { /* If device has the latest data, we only copy data on device */ 619 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 620 bkok->a_dual.modify_device(); 621 } 622 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 623 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 624 bkok->a_dual.modify_host(); 625 } 626 mat->spptr = bkok; 627 } 628 629 PetscCall(PetscFree(mat->defaultvectype)); 630 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 631 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 632 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 633 PetscFunctionReturn(PETSC_SUCCESS); 634 } 635 636 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 637 { 638 Mat At; 639 KokkosCsrMatrix internT; 640 Mat_SeqAIJKokkos *atkok, *bkok; 641 642 PetscFunctionBegin; 643 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 644 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 645 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 646 /* Deep copy internT, as we want to isolate the internal transpose */ 647 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 648 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 649 if (reuse == MAT_INITIAL_MATRIX) *B = At; 650 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 651 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 652 if ((*B)->assembled) { 653 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 654 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 655 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 656 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 657 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 658 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 659 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 660 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 661 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 662 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 663 } 664 PetscFunctionReturn(PETSC_SUCCESS); 665 } 666 667 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 668 { 669 Mat_SeqAIJKokkos *aijkok; 670 671 PetscFunctionBegin; 672 if (A->factortype == MAT_FACTOR_NONE) { 673 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 674 delete aijkok; 675 } else { 676 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 677 } 678 A->spptr = NULL; 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 681 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 682 #if defined(PETSC_HAVE_HYPRE) 683 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 684 #endif 685 PetscCall(MatDestroy_SeqAIJ(A)); 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688 689 /*MC 690 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 691 692 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 693 694 Options Database Key: 695 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 696 697 Level: beginner 698 699 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 700 M*/ 701 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 702 { 703 PetscFunctionBegin; 704 PetscCall(PetscKokkosInitializeCheck()); 705 PetscCall(MatCreate_SeqAIJ(A)); 706 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 707 PetscFunctionReturn(PETSC_SUCCESS); 708 } 709 710 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 711 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 712 { 713 Mat_SeqAIJ *a, *b; 714 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 715 MatScalarKokkosView aa, ba, ca; 716 MatRowMapKokkosView ai, bi, ci; 717 MatColIdxKokkosView aj, bj, cj; 718 PetscInt m, n, nnz, aN; 719 720 PetscFunctionBegin; 721 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 722 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 723 PetscAssertPointer(C, 4); 724 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 725 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 726 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 727 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 728 729 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 730 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 731 a = static_cast<Mat_SeqAIJ *>(A->data); 732 b = static_cast<Mat_SeqAIJ *>(B->data); 733 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 734 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 735 aa = akok->a_dual.view_device(); 736 ai = akok->i_dual.view_device(); 737 ba = bkok->a_dual.view_device(); 738 bi = bkok->i_dual.view_device(); 739 m = A->rmap->n; /* M, N and nnz of C */ 740 n = A->cmap->n + B->cmap->n; 741 nnz = a->nz + b->nz; 742 aN = A->cmap->n; /* N of A */ 743 if (reuse == MAT_INITIAL_MATRIX) { 744 aj = akok->j_dual.view_device(); 745 bj = bkok->j_dual.view_device(); 746 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 747 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 748 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 749 750 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 751 Kokkos::parallel_for( 752 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 753 PetscInt i = t.league_rank(); /* row i */ 754 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 755 756 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 757 ci(i) = coffset; 758 if (i == m - 1) ci(m) = ai(m) + bi(m); 759 }); 760 761 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 762 if (k < alen) { 763 ca(coffset + k) = aa(ai(i) + k); 764 cj(coffset + k) = aj(ai(i) + k); 765 } else { 766 ca(coffset + k) = ba(bi(i) + k - alen); 767 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 768 } 769 }); 770 }); 771 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 772 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 773 } else if (reuse == MAT_REUSE_MATRIX) { 774 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 775 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 776 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 777 ca = ckok->a_dual.view_device(); 778 ci = ckok->i_dual.view_device(); 779 780 Kokkos::parallel_for( 781 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 782 PetscInt i = t.league_rank(); /* row i */ 783 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 784 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 785 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 786 else ca(ci(i) + k) = ba(bi(i) + k - alen); 787 }); 788 }); 789 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 790 } 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata) 795 { 796 PetscFunctionBegin; 797 delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata); 798 PetscFunctionReturn(PETSC_SUCCESS); 799 } 800 801 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 802 { 803 Mat_Product *product = C->product; 804 Mat A, B; 805 bool transA, transB; /* use bool, since KK needs this type */ 806 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 807 Mat_SeqAIJ *c; 808 MatProductCtx_SeqAIJKokkos *pdata; 809 KokkosCsrMatrix csrmatA, csrmatB; 810 811 PetscFunctionBegin; 812 MatCheckProduct(C, 1); 813 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 814 pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data); 815 816 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 817 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 818 // we still do numeric. 819 if (pdata->reusesym) { // numeric reuses results from symbolic 820 pdata->reusesym = PETSC_FALSE; 821 PetscFunctionReturn(PETSC_SUCCESS); 822 } 823 824 switch (product->type) { 825 case MATPRODUCT_AB: 826 transA = false; 827 transB = false; 828 break; 829 case MATPRODUCT_AtB: 830 transA = true; 831 transB = false; 832 break; 833 case MATPRODUCT_ABt: 834 transA = false; 835 transB = true; 836 break; 837 default: 838 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 839 } 840 841 A = product->A; 842 B = product->B; 843 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 844 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 845 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 846 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 847 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 848 849 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 850 851 csrmatA = akok->csrmat; 852 csrmatB = bkok->csrmat; 853 854 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 855 if (transA) { 856 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 857 transA = false; 858 } 859 860 if (transB) { 861 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 862 transB = false; 863 } 864 PetscCall(PetscLogGpuTimeBegin()); 865 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 866 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 867 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 868 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 869 #endif 870 871 PetscCall(PetscLogGpuTimeEnd()); 872 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 873 /* shorter version of MatAssemblyEnd_SeqAIJ */ 874 c = (Mat_SeqAIJ *)C->data; 875 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 876 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 877 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 878 c->reallocs = 0; 879 C->info.mallocs = 0; 880 C->info.nz_unneeded = 0; 881 C->assembled = C->was_assembled = PETSC_TRUE; 882 C->num_ass++; 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 887 { 888 Mat_Product *product = C->product; 889 MatProductType ptype; 890 Mat A, B; 891 bool transA, transB; 892 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 893 MatProductCtx_SeqAIJKokkos *pdata; 894 MPI_Comm comm; 895 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 896 897 PetscFunctionBegin; 898 MatCheckProduct(C, 1); 899 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 900 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 901 A = product->A; 902 B = product->B; 903 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 904 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 905 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 906 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 907 csrmatA = akok->csrmat; 908 csrmatB = bkok->csrmat; 909 910 ptype = product->type; 911 // Take advantage of the symmetry if true 912 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 913 ptype = MATPRODUCT_AB; 914 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 915 } 916 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 917 ptype = MATPRODUCT_AB; 918 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 919 } 920 921 switch (ptype) { 922 case MATPRODUCT_AB: 923 transA = false; 924 transB = false; 925 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 926 break; 927 case MATPRODUCT_AtB: 928 transA = true; 929 transB = false; 930 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 931 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 932 break; 933 case MATPRODUCT_ABt: 934 transA = false; 935 transB = true; 936 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 937 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 938 break; 939 default: 940 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 941 } 942 PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos()); 943 pdata->reusesym = product->api_user; 944 945 /* TODO: add command line options to select spgemm algorithms */ 946 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 947 948 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 949 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 950 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 951 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 952 #endif 953 #endif 954 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 955 956 PetscCall(PetscLogGpuTimeBegin()); 957 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 958 if (transA) { 959 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 960 transA = false; 961 } 962 963 if (transB) { 964 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 965 transB = false; 966 } 967 968 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 969 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 970 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 971 calling new Mat_SeqAIJKokkos(). 972 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 973 */ 974 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 975 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 976 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 977 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 978 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 979 #endif 980 PetscCall(PetscLogGpuTimeEnd()); 981 982 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 983 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 984 C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos; 985 PetscFunctionReturn(PETSC_SUCCESS); 986 } 987 988 /* handles sparse matrix matrix ops */ 989 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 990 { 991 Mat_Product *product = mat->product; 992 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 993 994 PetscFunctionBegin; 995 MatCheckProduct(mat, 1); 996 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 997 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 998 if (Biskok && Ciskok) { 999 switch (product->type) { 1000 case MATPRODUCT_AB: 1001 case MATPRODUCT_AtB: 1002 case MATPRODUCT_ABt: 1003 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1004 break; 1005 case MATPRODUCT_PtAP: 1006 case MATPRODUCT_RARt: 1007 case MATPRODUCT_ABC: 1008 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1009 break; 1010 default: 1011 break; 1012 } 1013 } else { /* fallback for AIJ */ 1014 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1015 } 1016 PetscFunctionReturn(PETSC_SUCCESS); 1017 } 1018 1019 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1020 { 1021 Mat_SeqAIJKokkos *aijkok; 1022 1023 PetscFunctionBegin; 1024 PetscCall(PetscLogGpuTimeBegin()); 1025 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1026 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1027 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1028 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1029 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1030 PetscCall(PetscLogGpuTimeEnd()); 1031 PetscFunctionReturn(PETSC_SUCCESS); 1032 } 1033 1034 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1035 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1036 { 1037 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1038 1039 PetscFunctionBegin; 1040 if (A->assembled && aijseq->diagDense) { // no missing diagonals 1041 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1042 1043 PetscCall(PetscLogGpuTimeBegin()); 1044 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046 const auto &Aa = aijkok->a_dual.view_device(); 1047 const auto &Adiag = aijkok->diag_dual.view_device(); 1048 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1049 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1050 PetscCall(PetscLogGpuFlops(n)); 1051 PetscCall(PetscLogGpuTimeEnd()); 1052 } else { // need reassembly, very slow! 1053 PetscCall(MatShift_Basic(A, a)); 1054 } 1055 PetscFunctionReturn(PETSC_SUCCESS); 1056 } 1057 1058 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1059 { 1060 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1061 1062 PetscFunctionBegin; 1063 if (Y->assembled && aijseq->diagDense) { // no missing diagonals 1064 ConstPetscScalarKokkosView dv; 1065 PetscInt n, nv; 1066 1067 PetscCall(PetscLogGpuTimeBegin()); 1068 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1069 PetscCall(VecGetKokkosView(D, &dv)); 1070 PetscCall(VecGetLocalSize(D, &nv)); 1071 n = PetscMin(Y->rmap->n, Y->cmap->n); 1072 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1073 1074 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1075 const auto &Aa = aijkok->a_dual.view_device(); 1076 const auto &Adiag = aijkok->diag_dual.view_device(); 1077 PetscCallCXX(Kokkos::parallel_for( 1078 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1079 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1080 else Aa(Adiag(i)) += dv(i); 1081 })); 1082 PetscCall(VecRestoreKokkosView(D, &dv)); 1083 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1084 PetscCall(PetscLogGpuFlops(n)); 1085 PetscCall(PetscLogGpuTimeEnd()); 1086 } else { // need reassembly, very slow! 1087 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1088 } 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1093 { 1094 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1095 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1096 ConstPetscScalarKokkosView lv, rv; 1097 1098 PetscFunctionBegin; 1099 PetscCall(PetscLogGpuTimeBegin()); 1100 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1101 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1102 const auto &Aa = aijkok->a_dual.view_device(); 1103 const auto &Ai = aijkok->i_dual.view_device(); 1104 const auto &Aj = aijkok->j_dual.view_device(); 1105 if (ll) { 1106 PetscCall(VecGetLocalSize(ll, &m)); 1107 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1108 PetscCall(VecGetKokkosView(ll, &lv)); 1109 PetscCallCXX(Kokkos::parallel_for( // for each row 1110 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1111 PetscInt i = t.league_rank(); // row i 1112 PetscInt len = Ai(i + 1) - Ai(i); 1113 // scale entries on the row 1114 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1115 })); 1116 PetscCall(VecRestoreKokkosView(ll, &lv)); 1117 PetscCall(PetscLogGpuFlops(nz)); 1118 } 1119 if (rr) { 1120 PetscCall(VecGetLocalSize(rr, &n)); 1121 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1122 PetscCall(VecGetKokkosView(rr, &rv)); 1123 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1124 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1125 PetscCall(VecRestoreKokkosView(rr, &lv)); 1126 PetscCall(PetscLogGpuFlops(nz)); 1127 } 1128 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1129 PetscCall(PetscLogGpuTimeEnd()); 1130 PetscFunctionReturn(PETSC_SUCCESS); 1131 } 1132 1133 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1134 { 1135 Mat_SeqAIJKokkos *aijkok; 1136 1137 PetscFunctionBegin; 1138 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1139 if (aijkok) { /* Only zero the device if data is already there */ 1140 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1141 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1142 } else { /* Might be preallocated but not assembled */ 1143 PetscCall(MatZeroEntries_SeqAIJ(A)); 1144 } 1145 PetscFunctionReturn(PETSC_SUCCESS); 1146 } 1147 1148 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1149 { 1150 Mat_SeqAIJKokkos *aijkok; 1151 PetscInt n; 1152 PetscScalarKokkosView xv; 1153 1154 PetscFunctionBegin; 1155 PetscCall(VecGetLocalSize(x, &n)); 1156 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1157 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1158 1159 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1160 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1161 1162 const auto &Aa = aijkok->a_dual.view_device(); 1163 const auto &Ai = aijkok->i_dual.view_device(); 1164 const auto &Adiag = aijkok->diag_dual.view_device(); 1165 1166 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1167 Kokkos::parallel_for( 1168 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1169 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1170 else xv(i) = 0; 1171 }); 1172 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1173 PetscFunctionReturn(PETSC_SUCCESS); 1174 } 1175 1176 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1177 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1178 { 1179 Mat_SeqAIJKokkos *aijkok; 1180 1181 PetscFunctionBegin; 1182 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1183 PetscAssertPointer(kv, 2); 1184 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1185 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1186 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1187 *kv = aijkok->a_dual.view_device(); 1188 PetscFunctionReturn(PETSC_SUCCESS); 1189 } 1190 1191 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1192 { 1193 PetscFunctionBegin; 1194 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1195 PetscAssertPointer(kv, 2); 1196 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1197 PetscFunctionReturn(PETSC_SUCCESS); 1198 } 1199 1200 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1201 { 1202 Mat_SeqAIJKokkos *aijkok; 1203 1204 PetscFunctionBegin; 1205 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1206 PetscAssertPointer(kv, 2); 1207 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1208 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1209 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1210 *kv = aijkok->a_dual.view_device(); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1215 { 1216 PetscFunctionBegin; 1217 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1218 PetscAssertPointer(kv, 2); 1219 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1220 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1225 { 1226 Mat_SeqAIJKokkos *aijkok; 1227 1228 PetscFunctionBegin; 1229 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1230 PetscAssertPointer(kv, 2); 1231 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1232 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1233 *kv = aijkok->a_dual.view_device(); 1234 PetscFunctionReturn(PETSC_SUCCESS); 1235 } 1236 1237 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1238 { 1239 PetscFunctionBegin; 1240 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1241 PetscAssertPointer(kv, 2); 1242 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1243 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1244 PetscFunctionReturn(PETSC_SUCCESS); 1245 } 1246 1247 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1248 { 1249 Mat_SeqAIJKokkos *akok; 1250 1251 PetscFunctionBegin; 1252 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1253 PetscCall(MatCreate(comm, A)); 1254 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1255 PetscFunctionReturn(PETSC_SUCCESS); 1256 } 1257 1258 /* Computes Y += alpha X */ 1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1260 { 1261 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1262 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1263 ConstMatScalarKokkosView Xa; 1264 MatScalarKokkosView Ya; 1265 auto exec = PetscGetKokkosExecutionSpace(); 1266 1267 PetscFunctionBegin; 1268 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1269 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1270 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1271 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1272 PetscCall(PetscLogGpuTimeBegin()); 1273 1274 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1275 PetscBool e; 1276 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1277 if (e) { 1278 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1279 if (e) pattern = SAME_NONZERO_PATTERN; 1280 } 1281 } 1282 1283 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1284 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1285 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1286 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1287 */ 1288 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1289 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1290 Xa = xkok->a_dual.view_device(); 1291 Ya = ykok->a_dual.view_device(); 1292 1293 if (pattern == SAME_NONZERO_PATTERN) { 1294 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1295 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1296 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1297 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1298 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1299 1300 Kokkos::parallel_for( 1301 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1302 PetscInt i = t.league_rank(); // row i 1303 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1304 // Only one thread works in a team 1305 PetscInt p, q = Yi(i); 1306 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1307 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1308 if (Xj(p) == Yj(q)) { // Found it 1309 Ya(q) += alpha * Xa(p); 1310 q++; 1311 } else { 1312 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1313 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1314 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1316 #else 1317 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1318 #endif 1319 } 1320 } 1321 }); 1322 }); 1323 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1324 } else { // different nonzero patterns 1325 Mat Z; 1326 KokkosCsrMatrix zcsr; 1327 KernelHandle kh; 1328 kh.create_spadd_handle(true); // X, Y are sorted 1329 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1330 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1331 zkok = new Mat_SeqAIJKokkos(zcsr); 1332 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1333 PetscCall(MatHeaderReplace(Y, &Z)); 1334 kh.destroy_spadd_handle(); 1335 } 1336 PetscCall(PetscLogGpuTimeEnd()); 1337 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1338 PetscFunctionReturn(PETSC_SUCCESS); 1339 } 1340 1341 struct MatCOOStruct_SeqAIJKokkos { 1342 PetscCount n; 1343 PetscCount Atot; 1344 PetscInt nz; 1345 PetscCountKokkosView jmap; 1346 PetscCountKokkosView perm; 1347 1348 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1349 { 1350 nz = coo_h->nz; 1351 n = coo_h->n; 1352 Atot = coo_h->Atot; 1353 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1354 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1355 } 1356 }; 1357 1358 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1359 { 1360 PetscFunctionBegin; 1361 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1362 PetscFunctionReturn(PETSC_SUCCESS); 1363 } 1364 1365 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1366 { 1367 Mat_SeqAIJKokkos *akok; 1368 Mat_SeqAIJ *aseq; 1369 PetscContainer container_h; 1370 MatCOOStruct_SeqAIJ *coo_h; 1371 MatCOOStruct_SeqAIJKokkos *coo_d; 1372 1373 PetscFunctionBegin; 1374 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1375 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1376 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1377 delete akok; 1378 mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1379 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1380 1381 // Copy the COO struct to device 1382 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1383 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1384 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1385 1386 // Put the COO struct in a container and then attach that to the matrix 1387 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1388 PetscFunctionReturn(PETSC_SUCCESS); 1389 } 1390 1391 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1392 { 1393 MatScalarKokkosView Aa; 1394 ConstMatScalarKokkosView kv; 1395 PetscMemType memtype; 1396 PetscContainer container; 1397 MatCOOStruct_SeqAIJKokkos *coo; 1398 1399 PetscFunctionBegin; 1400 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1401 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1402 1403 const auto &n = coo->n; 1404 const auto &Annz = coo->nz; 1405 const auto &jmap = coo->jmap; 1406 const auto &perm = coo->perm; 1407 1408 PetscCall(PetscGetMemType(v, &memtype)); 1409 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1410 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1411 } else { 1412 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1413 } 1414 1415 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1416 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1417 1418 PetscCall(PetscLogGpuTimeBegin()); 1419 Kokkos::parallel_for( 1420 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1421 PetscScalar sum = 0.0; 1422 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1423 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1424 }); 1425 PetscCall(PetscLogGpuTimeEnd()); 1426 1427 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1428 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1433 { 1434 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1435 1436 PetscFunctionBegin; 1437 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1438 A->boundtocpu = PETSC_FALSE; 1439 1440 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1441 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1442 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1443 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1444 A->ops->scale = MatScale_SeqAIJKokkos; 1445 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1446 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1447 A->ops->mult = MatMult_SeqAIJKokkos; 1448 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1449 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1450 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1451 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1452 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1453 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1454 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1455 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1456 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1457 A->ops->shift = MatShift_SeqAIJKokkos; 1458 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1459 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1460 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1461 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1462 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1463 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1464 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1465 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1466 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1467 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1468 1469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1470 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1471 #if defined(PETSC_HAVE_HYPRE) 1472 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1473 #endif 1474 PetscFunctionReturn(PETSC_SUCCESS); 1475 } 1476 1477 /* 1478 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1479 1480 Input Parameters: 1481 + A - the MATSEQAIJKOKKOS matrix 1482 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1483 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1484 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1485 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1486 1487 Output Parameter: 1488 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1489 */ 1490 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1491 { 1492 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1493 PetscInt nblocks = bs.extent(0) - 1; 1494 1495 PetscFunctionBegin; 1496 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1497 1498 // Pull out the diagonal blocks of the matrix and then invert the blocks 1499 auto Aa = akok->a_dual.view_device(); 1500 auto Ai = akok->i_dual.view_device(); 1501 auto Aj = akok->j_dual.view_device(); 1502 auto Adiag = akok->diag_dual.view_device(); 1503 // TODO: how to tune the team size? 1504 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1505 auto ts = Kokkos::AUTO(); 1506 #else 1507 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1508 #endif 1509 PetscCallCXX(Kokkos::parallel_for( 1510 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1511 const PetscInt bid = teamMember.league_rank(); // block id 1512 const PetscInt rstart = bs(bid); // this block starts from this row 1513 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1514 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1515 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1516 1517 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1518 PetscInt i = rstart + r; // i-th row in A 1519 1520 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1521 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1522 1523 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1524 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1525 B(r, c) = 0.0; 1526 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1527 B(r, c) = Aa(first + c); 1528 } else { // this entry does not show up in the CSR 1529 B(r, c) = 0.0; 1530 } 1531 } 1532 } else { // rare case that the diagonal does not exist 1533 const PetscInt begin = Ai(i); 1534 const PetscInt end = Ai(i + 1); 1535 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1536 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1537 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1538 else if (Aj(j) >= rstart + m) break; 1539 } 1540 } 1541 }); 1542 1543 // LU-decompose B (w/o pivoting) and then invert B 1544 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1545 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1546 })); 1547 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1548 PetscFunctionReturn(PETSC_SUCCESS); 1549 } 1550 1551 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1552 { 1553 Mat_SeqAIJ *aseq; 1554 PetscInt i, m, n; 1555 auto exec = PetscGetKokkosExecutionSpace(); 1556 1557 PetscFunctionBegin; 1558 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1559 1560 m = akok->nrows(); 1561 n = akok->ncols(); 1562 PetscCall(MatSetSizes(A, m, n, m, n)); 1563 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1564 1565 /* Set up data structures of A as a MATSEQAIJ */ 1566 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1567 aseq = (Mat_SeqAIJ *)A->data; 1568 1569 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1570 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1571 1572 aseq->i = akok->i_host_data(); 1573 aseq->j = akok->j_host_data(); 1574 aseq->a = akok->a_host_data(); 1575 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1576 aseq->free_a = PETSC_FALSE; 1577 aseq->free_ij = PETSC_FALSE; 1578 aseq->nz = akok->nnz(); 1579 aseq->maxnz = aseq->nz; 1580 1581 PetscCall(PetscMalloc1(m, &aseq->imax)); 1582 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1583 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1584 1585 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1586 akok->nonzerostate = A->nonzerostate; 1587 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1588 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1589 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1590 PetscFunctionReturn(PETSC_SUCCESS); 1591 } 1592 1593 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1594 { 1595 PetscFunctionBegin; 1596 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1597 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1598 PetscFunctionReturn(PETSC_SUCCESS); 1599 } 1600 1601 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1602 { 1603 Mat_SeqAIJKokkos *akok; 1604 1605 PetscFunctionBegin; 1606 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1607 PetscCall(MatCreate(comm, A)); 1608 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1609 PetscFunctionReturn(PETSC_SUCCESS); 1610 } 1611 1612 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1613 1614 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1615 */ 1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1617 { 1618 PetscFunctionBegin; 1619 PetscCall(MatCreate(comm, A)); 1620 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 /*@C 1625 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1626 (the default parallel PETSc format). This matrix will ultimately be handled by 1627 Kokkos for calculations. 1628 1629 Collective 1630 1631 Input Parameters: 1632 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1633 . m - number of rows 1634 . n - number of columns 1635 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1636 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1637 1638 Output Parameter: 1639 . A - the matrix 1640 1641 Level: intermediate 1642 1643 Notes: 1644 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1645 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1646 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1647 1648 The AIJ format, also called 1649 compressed row storage, is fully compatible with standard Fortran 1650 storage. That is, the stored row and column indices can begin at 1651 either one (as in Fortran) or zero. 1652 1653 Specify the preallocated storage with either `nz` or `nnz` (not both). 1654 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1655 allocation. 1656 1657 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1658 @*/ 1659 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1660 { 1661 PetscFunctionBegin; 1662 PetscCall(PetscKokkosInitializeCheck()); 1663 PetscCall(MatCreate(comm, A)); 1664 PetscCall(MatSetSizes(*A, m, n, m, n)); 1665 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1666 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1667 PetscFunctionReturn(PETSC_SUCCESS); 1668 } 1669 1670 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1671 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1672 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1673 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1674 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1675 { 1676 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1677 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1678 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1679 1680 PetscFunctionBegin; 1681 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1682 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1683 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1684 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1685 } 1686 PetscFunctionReturn(PETSC_SUCCESS); 1687 } 1688 1689 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1690 { 1691 const PetscInt n = A->rmap->n; 1692 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1693 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1694 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1695 1696 PetscFunctionBegin; 1697 if (!factors->transpose_updated) { 1698 if (has_upper) { 1699 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1700 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1701 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1702 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1703 } 1704 1705 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1706 if (!factors->U) { 1707 Mat_SeqAIJ *seq; 1708 1709 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1710 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1711 1712 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1713 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1714 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1715 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1716 } else { 1717 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1718 } 1719 // Copy Ut from host to device 1720 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1721 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1722 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1723 } else { // If U was computed on device, we also compute the transpose there 1724 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1725 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1726 factors->jU_d, factors->aU_d, 1727 factors->iUt_d, factors->jUt_d, 1728 factors->aUt_d)); 1729 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1730 } 1731 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1732 } 1733 1734 // do the same for L with LU 1735 if (has_lower) { 1736 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1737 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1738 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1739 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1740 } 1741 1742 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1743 if (!factors->L) { 1744 Mat_SeqAIJ *seq; 1745 1746 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1747 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1748 1749 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1750 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1751 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1752 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1753 } else { 1754 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1755 } 1756 // Copy Lt from host to device 1757 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1758 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1759 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1760 } else { // If L was computed on device, we also compute the transpose there 1761 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1762 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1763 factors->jL_d, factors->aL_d, 1764 factors->iLt_d, factors->jLt_d, 1765 factors->aLt_d)); 1766 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1767 } 1768 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1769 } 1770 1771 factors->transpose_updated = PETSC_TRUE; 1772 } 1773 PetscFunctionReturn(PETSC_SUCCESS); 1774 } 1775 1776 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1777 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1778 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1779 { 1780 auto exec = PetscGetKokkosExecutionSpace(); 1781 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1782 PetscInt m = A->rmap->n; 1783 PetscScalarKokkosView D = factors->D_d; 1784 PetscScalarKokkosView X, Y, B; // alias 1785 ConstPetscScalarKokkosView b; 1786 PetscScalarKokkosView x; 1787 PetscIntKokkosView &rowperm = factors->rowperm; 1788 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1789 1790 PetscFunctionBegin; 1791 PetscCall(PetscLogGpuTimeBegin()); 1792 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1793 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1794 PetscCall(VecGetKokkosView(bb, &b)); 1795 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1796 1797 // Solve U^T Y = B 1798 if (identity) { // Reorder b with the row permutation 1799 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1800 Y = factors->workVector; 1801 } else { 1802 B = factors->workVector; 1803 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1804 Y = x; 1805 } 1806 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1807 1808 // Solve diag(D) Y' = Y. 1809 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1810 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1811 1812 // Solve UX = Y 1813 if (identity) { 1814 X = x; 1815 } else { 1816 X = factors->workVector; // B is not needed anymore 1817 } 1818 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1819 1820 // Reorder X with the inverse column (row) permutation 1821 if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1822 1823 PetscCall(VecRestoreKokkosView(bb, &b)); 1824 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1825 PetscCall(PetscLogGpuTimeEnd()); 1826 PetscFunctionReturn(PETSC_SUCCESS); 1827 } 1828 1829 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1830 // R and C are represented by rowperm and colperm in factors. 1831 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1832 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1833 { 1834 auto exec = PetscGetKokkosExecutionSpace(); 1835 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1836 PetscInt m = A->rmap->n; 1837 PetscScalarKokkosView X, Y, B; // alias 1838 ConstPetscScalarKokkosView b; 1839 PetscScalarKokkosView x; 1840 PetscIntKokkosView &rowperm = factors->rowperm; 1841 PetscIntKokkosView &colperm = factors->colperm; 1842 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1843 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1844 1845 PetscFunctionBegin; 1846 PetscCall(PetscLogGpuTimeBegin()); 1847 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1848 PetscCall(VecGetKokkosView(bb, &b)); 1849 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1850 1851 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1852 if (row_identity) { 1853 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1854 Y = factors->workVector; 1855 } else { 1856 B = factors->workVector; 1857 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1858 Y = x; 1859 } 1860 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1861 1862 // Solve U C^- x = Y 1863 if (col_identity) { 1864 X = x; 1865 } else { 1866 X = factors->workVector; 1867 } 1868 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1869 1870 // x = C X; Reorder X with the inverse col permutation 1871 if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1872 1873 PetscCall(VecRestoreKokkosView(bb, &b)); 1874 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1875 PetscCall(PetscLogGpuTimeEnd()); 1876 PetscFunctionReturn(PETSC_SUCCESS); 1877 } 1878 1879 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1880 // R and C are represented by rowperm and colperm in factors. 1881 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1882 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1883 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1884 { 1885 auto exec = PetscGetKokkosExecutionSpace(); 1886 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1887 PetscInt m = A->rmap->n; 1888 PetscScalarKokkosView X, Y, B; // alias 1889 ConstPetscScalarKokkosView b; 1890 PetscScalarKokkosView x; 1891 PetscIntKokkosView &rowperm = factors->rowperm; 1892 PetscIntKokkosView &colperm = factors->colperm; 1893 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1894 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1895 1896 PetscFunctionBegin; 1897 PetscCall(PetscLogGpuTimeBegin()); 1898 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1899 PetscCall(VecGetKokkosView(bb, &b)); 1900 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1901 1902 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1903 if (col_identity) { // Reorder b with the col permutation 1904 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1905 Y = factors->workVector; 1906 } else { 1907 B = factors->workVector; 1908 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1909 Y = x; 1910 } 1911 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1912 1913 // Solve L^T X = Y 1914 if (row_identity) { 1915 X = x; 1916 } else { 1917 X = factors->workVector; 1918 } 1919 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1920 1921 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1922 if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1923 1924 PetscCall(VecRestoreKokkosView(bb, &b)); 1925 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1926 PetscCall(PetscLogGpuTimeEnd()); 1927 PetscFunctionReturn(PETSC_SUCCESS); 1928 } 1929 1930 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1931 { 1932 PetscFunctionBegin; 1933 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1934 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1935 1936 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1937 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1938 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1939 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1940 const MatScalar *Ba = b->a; 1941 PetscInt m = B->rmap->n, n = B->cmap->n; 1942 1943 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1944 // Allocate memory and copy the structure 1945 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1946 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1947 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1948 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1949 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1950 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1951 1952 PetscInt *Li = factors->iL_h.data(); 1953 PetscInt *Lj = factors->jL_h.data(); 1954 PetscInt *Ui = factors->iU_h.data(); 1955 PetscInt *Uj = factors->jU_h.data(); 1956 1957 Li[0] = Ui[0] = 0; 1958 for (PetscInt i = 0; i < m; i++) { 1959 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1960 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1961 1962 PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1963 Lj[Li[i] + llen] = i; // diagonal entry of L 1964 1965 Uj[Ui[i]] = i; // diagonal entry of U 1966 PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1967 1968 Li[i + 1] = Li[i] + llen + 1; 1969 Ui[i + 1] = Ui[i] + ulen; 1970 } 1971 1972 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1973 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1974 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1975 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1976 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1977 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1978 1979 // Copy row/col permutation to device 1980 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1981 PetscBool row_identity; 1982 PetscCall(ISIdentity(rowperm, &row_identity)); 1983 if (!row_identity) { 1984 const PetscInt *ip; 1985 1986 PetscCall(ISGetIndices(rowperm, &ip)); 1987 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1988 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1989 PetscCall(ISRestoreIndices(rowperm, &ip)); 1990 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1991 } 1992 1993 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1994 PetscBool col_identity; 1995 PetscCall(ISIdentity(colperm, &col_identity)); 1996 if (!col_identity) { 1997 const PetscInt *ip; 1998 1999 PetscCall(ISGetIndices(colperm, &ip)); 2000 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2001 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2002 PetscCall(ISRestoreIndices(colperm, &ip)); 2003 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2004 } 2005 2006 /* Create sptrsv handles for L, U and their transpose */ 2007 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2008 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2009 #else 2010 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2011 #endif 2012 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2013 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2014 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2015 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2016 } 2017 2018 // Copy the value 2019 for (PetscInt i = 0; i < m; i++) { 2020 PetscInt llen = Bi[i + 1] - Bi[i]; 2021 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2022 const PetscInt *Li = factors->iL_h.data(); 2023 const PetscInt *Ui = factors->iU_h.data(); 2024 2025 PetscScalar *La = factors->aL_h.data(); 2026 PetscScalar *Ua = factors->aU_h.data(); 2027 2028 PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2029 La[Li[i] + llen] = 1.0; // diagonal entry 2030 2031 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2032 PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2033 } 2034 2035 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2036 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2037 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2038 factors->transpose_updated = PETSC_FALSE; 2039 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2040 2041 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2042 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2043 } 2044 2045 B->ops->matsolve = NULL; 2046 B->ops->matsolvetranspose = NULL; 2047 PetscFunctionReturn(PETSC_SUCCESS); 2048 } 2049 2050 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2051 { 2052 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2053 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2054 PetscInt fill_lev = info->levels; 2055 2056 PetscFunctionBegin; 2057 PetscCall(PetscLogGpuTimeBegin()); 2058 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2059 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2060 2061 auto a_d = aijkok->a_dual.view_device(); 2062 auto i_d = aijkok->i_dual.view_device(); 2063 auto j_d = aijkok->j_dual.view_device(); 2064 2065 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2066 2067 B->assembled = PETSC_TRUE; 2068 B->preallocated = PETSC_TRUE; 2069 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2070 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2071 B->ops->matsolve = NULL; 2072 B->ops->matsolvetranspose = NULL; 2073 2074 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2075 factors->transpose_updated = PETSC_FALSE; 2076 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2077 /* TODO: log flops, but how to know that? */ 2078 PetscCall(PetscLogGpuTimeEnd()); 2079 PetscFunctionReturn(PETSC_SUCCESS); 2080 } 2081 2082 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2083 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2084 { 2085 Mat_SeqAIJKokkos *aijkok; 2086 Mat_SeqAIJ *b; 2087 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2088 PetscInt fill_lev = info->levels; 2089 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2090 PetscInt n = A->rmap->n; 2091 2092 PetscFunctionBegin; 2093 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2094 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2095 2096 /* Create a spiluk handle and then do symbolic factorization */ 2097 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2098 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2099 2100 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2101 2102 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2103 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2104 Kokkos::realloc(factors->iU_d, n + 1); 2105 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2106 2107 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2108 auto i_d = aijkok->i_dual.view_device(); 2109 auto j_d = aijkok->j_dual.view_device(); 2110 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2111 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2112 2113 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2114 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2115 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2116 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2117 2118 /* TODO: add options to select sptrsv algorithms */ 2119 /* Create sptrsv handles for L, U and their transpose */ 2120 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2121 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2122 #else 2123 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2124 #endif 2125 2126 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2127 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2128 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2129 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2130 2131 /* Fill fields of the factor matrix B */ 2132 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2133 b = (Mat_SeqAIJ *)B->data; 2134 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2135 B->info.fill_ratio_given = info->fill; 2136 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2137 2138 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2139 PetscFunctionReturn(PETSC_SUCCESS); 2140 } 2141 2142 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2143 { 2144 PetscFunctionBegin; 2145 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2146 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2147 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2148 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2149 PetscFunctionReturn(PETSC_SUCCESS); 2150 } 2151 2152 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2153 { 2154 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2155 2156 PetscFunctionBegin; 2157 if (!info->factoronhost) { 2158 PetscCall(ISIdentity(isrow, &row_identity)); 2159 PetscCall(ISIdentity(iscol, &col_identity)); 2160 } 2161 2162 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2163 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2164 2165 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2166 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2167 } else { 2168 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2169 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2170 } 2171 PetscFunctionReturn(PETSC_SUCCESS); 2172 } 2173 2174 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2175 { 2176 PetscFunctionBegin; 2177 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2178 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2179 2180 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2181 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2182 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2183 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2184 const MatScalar *Ba = b->a; 2185 PetscInt m = B->rmap->n; 2186 2187 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2188 // Allocate memory and copy the structure 2189 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2190 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2191 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2192 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2193 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2194 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2195 2196 // Build jU_h from the skewed Aj 2197 PetscInt *Uj = factors->jU_h.data(); 2198 for (PetscInt i = 0; i < m; i++) { 2199 PetscInt ulen = Bi[i + 1] - Bi[i]; 2200 Uj[Bi[i]] = i; // diagonal entry 2201 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2202 } 2203 2204 // Copy iU, jU to device 2205 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2206 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2207 2208 // Copy row/col permutation to device 2209 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2210 PetscBool row_identity; 2211 PetscCall(ISIdentity(rowperm, &row_identity)); 2212 if (!row_identity) { 2213 const PetscInt *ip; 2214 2215 PetscCall(ISGetIndices(rowperm, &ip)); 2216 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2217 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2218 PetscCall(ISRestoreIndices(rowperm, &ip)); 2219 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2220 } 2221 2222 // Create sptrsv handles for U and U^T 2223 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2224 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2225 #else 2226 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2227 #endif 2228 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2229 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2230 } 2231 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2232 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2233 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2234 2235 // Copy the value 2236 PetscScalar *Ua = factors->aU_h.data(); 2237 PetscScalar *D = factors->D_h.data(); 2238 for (PetscInt i = 0; i < m; i++) { 2239 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2240 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2241 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2242 } 2243 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2244 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2245 2246 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2247 factors->transpose_updated = PETSC_FALSE; 2248 } 2249 2250 B->ops->matsolve = NULL; 2251 B->ops->matsolvetranspose = NULL; 2252 PetscFunctionReturn(PETSC_SUCCESS); 2253 } 2254 2255 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2256 { 2257 PetscFunctionBegin; 2258 if (info->solveonhost) { 2259 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2260 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2261 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2262 } 2263 2264 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2265 2266 if (!info->solveonhost) { 2267 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2268 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2269 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2270 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2271 } 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2276 { 2277 PetscFunctionBegin; 2278 if (info->solveonhost) { 2279 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2280 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2281 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2282 } 2283 2284 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2285 2286 if (!info->solveonhost) { 2287 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2288 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2289 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2290 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2291 } 2292 PetscFunctionReturn(PETSC_SUCCESS); 2293 } 2294 2295 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2296 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2297 { 2298 PetscFunctionBegin; 2299 *type = MATSOLVERKOKKOS; 2300 PetscFunctionReturn(PETSC_SUCCESS); 2301 } 2302 2303 /*MC 2304 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2305 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2306 2307 Level: beginner 2308 2309 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS` 2310 M*/ 2311 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2312 { 2313 PetscInt n = A->rmap->n; 2314 MPI_Comm comm; 2315 2316 PetscFunctionBegin; 2317 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2318 PetscCall(MatCreate(comm, B)); 2319 PetscCall(MatSetSizes(*B, n, n, n, n)); 2320 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2321 (*B)->factortype = ftype; 2322 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2323 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2324 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2325 2326 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2327 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2328 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2329 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2330 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2331 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2332 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2333 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2334 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2335 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2336 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2337 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2338 2339 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2340 (*B)->canuseordering = PETSC_TRUE; 2341 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2342 PetscFunctionReturn(PETSC_SUCCESS); 2343 } 2344 2345 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2346 { 2347 PetscFunctionBegin; 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2350 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2351 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2352 PetscFunctionReturn(PETSC_SUCCESS); 2353 } 2354 2355 /* Utility to print out a KokkosCsrMatrix for debugging */ 2356 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2357 { 2358 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2359 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2360 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2361 const PetscInt *i = iv.data(); 2362 const PetscInt *j = jv.data(); 2363 const PetscScalar *a = av.data(); 2364 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2365 2366 PetscFunctionBegin; 2367 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2368 for (PetscInt k = 0; k < m; k++) { 2369 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2370 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2371 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2372 } 2373 PetscFunctionReturn(PETSC_SUCCESS); 2374 } 2375