1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 delete aijkok; 82 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 83 A->spptr = aijkok; 84 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 85 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 86 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 87 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 88 } 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* Sync CSR data to device if not yet */ 93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 94 { 95 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 96 97 PetscFunctionBegin; 98 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 99 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 100 if (aijkok->a_dual.need_sync_device()) { 101 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 102 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 103 aijkok->hermitian_updated = PETSC_FALSE; 104 } 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 /* Mark the CSR data on device as modified */ 109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 110 { 111 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 112 113 PetscFunctionBegin; 114 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 115 aijkok->a_dual.clear_sync_state(); 116 aijkok->a_dual.modify_device(); 117 aijkok->transpose_updated = PETSC_FALSE; 118 aijkok->hermitian_updated = PETSC_FALSE; 119 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 120 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 auto exec = PetscGetKokkosExecutionSpace(); 128 129 PetscFunctionBegin; 130 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 131 /* We do not expect one needs factors on host */ 132 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 133 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 134 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 139 { 140 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 141 142 PetscFunctionBegin; 143 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 144 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 145 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 146 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 147 */ 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 150 *array = aijkok->a_dual.view_host().data(); 151 } else { /* Happens when calling MatSetValues on a newly created matrix */ 152 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 153 } 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 160 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 161 #endif 162 163 PetscFunctionBegin; 164 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 165 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 166 #endif 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 177 *array = aijkok->a_dual.view_host().data(); 178 } else { 179 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 180 } 181 PetscFunctionReturn(PETSC_SUCCESS); 182 } 183 184 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 185 { 186 PetscFunctionBegin; 187 *array = NULL; 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 197 *array = aijkok->a_dual.view_host().data(); 198 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 199 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 200 } 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 207 208 PetscFunctionBegin; 209 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 210 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 211 aijkok->a_dual.clear_sync_state(); 212 aijkok->a_dual.modify_host(); 213 } 214 #else 215 (void)aijkok; 216 #endif 217 PetscFunctionReturn(PETSC_SUCCESS); 218 } 219 220 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 221 { 222 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 223 224 PetscFunctionBegin; 225 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 226 227 if (i) *i = aijkok->i_device_data(); 228 if (j) *j = aijkok->j_device_data(); 229 if (a) { 230 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 231 *a = aijkok->a_device_data(); 232 } 233 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 234 PetscFunctionReturn(PETSC_SUCCESS); 235 } 236 237 /* 238 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 239 240 Input Parameter: 241 . A - the MATSEQAIJKOKKOS matrix 242 243 Output Parameters: 244 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 245 - T_d - the transpose on device, whose value array is allocated but not initialized 246 */ 247 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 248 { 249 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 250 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 251 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 252 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 253 MatRowMapType *Ti = Ti_h.data(); 254 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 255 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 256 PetscInt *Tj = Tj_h.data(); 257 PetscInt *perm = perm_h.data(); 258 PetscInt *offset; 259 260 PetscFunctionBegin; 261 // Populate Ti 262 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 263 Ti++; 264 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 265 Ti--; 266 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 267 268 // Populate Tj and the permutation array 269 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 270 for (PetscInt i = 0; i < m; i++) { 271 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 272 PetscInt r = Aj[j]; // row r of T 273 PetscInt disp = Ti[r] + offset[r]; 274 275 Tj[disp] = i; // col i of T 276 perm[disp] = j; 277 offset[r]++; 278 } 279 } 280 PetscCall(PetscFree(offset)); 281 282 // Sort each row of T, along with the permutation array 283 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 284 285 // Output perm and T on device 286 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 287 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 288 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 289 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 290 PetscFunctionReturn(PETSC_SUCCESS); 291 } 292 293 // Generate the transpose on device and cache it internally 294 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 295 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 296 { 297 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 298 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 299 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 300 KokkosCsrMatrix &T = akok->csrmatT; 301 302 PetscFunctionBegin; 303 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 304 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 305 306 const auto &Aa = akok->a_dual.view_device(); 307 308 if (A->symmetric == PETSC_BOOL3_TRUE) { 309 *csrmatT = akok->csrmat; 310 } else { 311 // See if we already have a cached transpose and its value is up to date 312 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 313 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 314 const auto &perm = akok->transpose_perm; // get the permutation array 315 auto &Ta = T.values; 316 317 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 318 } 319 } else { // Generate T of size n x m for the first time 320 MatRowMapKokkosView perm; 321 322 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 323 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 324 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 325 } 326 akok->transpose_updated = PETSC_TRUE; 327 *csrmatT = akok->csrmatT; 328 } 329 PetscFunctionReturn(PETSC_SUCCESS); 330 } 331 332 // Generate the Hermitian on device and cache it internally 333 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 334 { 335 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 336 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 337 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 338 KokkosCsrMatrix &T = akok->csrmatH; 339 340 PetscFunctionBegin; 341 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 342 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 343 344 const auto &Aa = akok->a_dual.view_device(); 345 346 if (A->hermitian == PETSC_BOOL3_TRUE) { 347 *csrmatH = akok->csrmat; 348 } else { 349 // See if we already have a cached hermitian and its value is up to date 350 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 351 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 352 const auto &perm = akok->transpose_perm; // get the permutation array 353 auto &Ta = T.values; 354 355 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 356 } 357 } else { // Generate T of size n x m for the first time 358 MatRowMapKokkosView perm; 359 360 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 361 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 362 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 363 } 364 akok->hermitian_updated = PETSC_TRUE; 365 *csrmatH = akok->csrmatH; 366 } 367 PetscFunctionReturn(PETSC_SUCCESS); 368 } 369 370 /* y = A x */ 371 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 372 { 373 Mat_SeqAIJKokkos *aijkok; 374 ConstPetscScalarKokkosView xv; 375 PetscScalarKokkosView yv; 376 377 PetscFunctionBegin; 378 PetscCall(PetscLogGpuTimeBegin()); 379 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 380 PetscCall(VecGetKokkosView(xx, &xv)); 381 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 382 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 383 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 384 PetscCall(VecRestoreKokkosView(xx, &xv)); 385 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 386 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 387 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 388 PetscCall(PetscLogGpuTimeEnd()); 389 PetscFunctionReturn(PETSC_SUCCESS); 390 } 391 392 /* y = A^T x */ 393 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 394 { 395 Mat_SeqAIJKokkos *aijkok; 396 const char *mode; 397 ConstPetscScalarKokkosView xv; 398 PetscScalarKokkosView yv; 399 KokkosCsrMatrix csrmat; 400 401 PetscFunctionBegin; 402 PetscCall(PetscLogGpuTimeBegin()); 403 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 404 PetscCall(VecGetKokkosView(xx, &xv)); 405 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 406 if (A->form_explicit_transpose) { 407 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 408 mode = "N"; 409 } else { 410 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 411 csrmat = aijkok->csrmat; 412 mode = "T"; 413 } 414 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 415 PetscCall(VecRestoreKokkosView(xx, &xv)); 416 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 417 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 418 PetscCall(PetscLogGpuTimeEnd()); 419 PetscFunctionReturn(PETSC_SUCCESS); 420 } 421 422 /* y = A^H x */ 423 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 424 { 425 Mat_SeqAIJKokkos *aijkok; 426 const char *mode; 427 ConstPetscScalarKokkosView xv; 428 PetscScalarKokkosView yv; 429 KokkosCsrMatrix csrmat; 430 431 PetscFunctionBegin; 432 PetscCall(PetscLogGpuTimeBegin()); 433 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 434 PetscCall(VecGetKokkosView(xx, &xv)); 435 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 436 if (A->form_explicit_transpose) { 437 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 438 mode = "N"; 439 } else { 440 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 441 csrmat = aijkok->csrmat; 442 mode = "C"; 443 } 444 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 445 PetscCall(VecRestoreKokkosView(xx, &xv)); 446 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 447 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 448 PetscCall(PetscLogGpuTimeEnd()); 449 PetscFunctionReturn(PETSC_SUCCESS); 450 } 451 452 /* z = A x + y */ 453 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 454 { 455 Mat_SeqAIJKokkos *aijkok; 456 ConstPetscScalarKokkosView xv; 457 PetscScalarKokkosView zv; 458 459 PetscFunctionBegin; 460 PetscCall(PetscLogGpuTimeBegin()); 461 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 462 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 463 PetscCall(VecGetKokkosView(xx, &xv)); 464 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 465 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 466 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 467 PetscCall(VecRestoreKokkosView(xx, &xv)); 468 PetscCall(VecRestoreKokkosView(zz, &zv)); 469 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 470 PetscCall(PetscLogGpuTimeEnd()); 471 PetscFunctionReturn(PETSC_SUCCESS); 472 } 473 474 /* z = A^T x + y */ 475 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 476 { 477 Mat_SeqAIJKokkos *aijkok; 478 const char *mode; 479 ConstPetscScalarKokkosView xv; 480 PetscScalarKokkosView zv; 481 KokkosCsrMatrix csrmat; 482 483 PetscFunctionBegin; 484 PetscCall(PetscLogGpuTimeBegin()); 485 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 486 if (zz != yy) PetscCall(VecCopy(yy, zz)); 487 PetscCall(VecGetKokkosView(xx, &xv)); 488 PetscCall(VecGetKokkosView(zz, &zv)); 489 if (A->form_explicit_transpose) { 490 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 491 mode = "N"; 492 } else { 493 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 494 csrmat = aijkok->csrmat; 495 mode = "T"; 496 } 497 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 498 PetscCall(VecRestoreKokkosView(xx, &xv)); 499 PetscCall(VecRestoreKokkosView(zz, &zv)); 500 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 501 PetscCall(PetscLogGpuTimeEnd()); 502 PetscFunctionReturn(PETSC_SUCCESS); 503 } 504 505 /* z = A^H x + y */ 506 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 507 { 508 Mat_SeqAIJKokkos *aijkok; 509 const char *mode; 510 ConstPetscScalarKokkosView xv; 511 PetscScalarKokkosView zv; 512 KokkosCsrMatrix csrmat; 513 514 PetscFunctionBegin; 515 PetscCall(PetscLogGpuTimeBegin()); 516 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 517 if (zz != yy) PetscCall(VecCopy(yy, zz)); 518 PetscCall(VecGetKokkosView(xx, &xv)); 519 PetscCall(VecGetKokkosView(zz, &zv)); 520 if (A->form_explicit_transpose) { 521 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 522 mode = "N"; 523 } else { 524 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 525 csrmat = aijkok->csrmat; 526 mode = "C"; 527 } 528 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 529 PetscCall(VecRestoreKokkosView(xx, &xv)); 530 PetscCall(VecRestoreKokkosView(zz, &zv)); 531 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 532 PetscCall(PetscLogGpuTimeEnd()); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 537 { 538 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 540 PetscFunctionBegin; 541 switch (op) { 542 case MAT_FORM_EXPLICIT_TRANSPOSE: 543 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 544 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 545 A->form_explicit_transpose = flg; 546 break; 547 default: 548 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 549 break; 550 } 551 PetscFunctionReturn(PETSC_SUCCESS); 552 } 553 554 /* Depending on reuse, either build a new mat, or use the existing mat */ 555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 556 { 557 Mat_SeqAIJ *aseq; 558 559 PetscFunctionBegin; 560 PetscCall(PetscKokkosInitializeCheck()); 561 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 562 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 563 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 564 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 565 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 566 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 567 PetscCall(PetscFree(A->defaultvectype)); 568 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 569 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 570 PetscCall(MatSetOps_SeqAIJKokkos(A)); 571 aseq = static_cast<Mat_SeqAIJ *>(A->data); 572 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 573 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 574 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 575 } 576 } 577 PetscFunctionReturn(PETSC_SUCCESS); 578 } 579 580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 581 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 582 */ 583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 584 { 585 Mat_SeqAIJ *bseq; 586 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 587 Mat mat; 588 589 PetscFunctionBegin; 590 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 591 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 592 mat = *B; 593 if (A->assembled) { 594 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 595 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 596 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 597 /* Now copy values to B if needed */ 598 if (dupOption == MAT_COPY_VALUES) { 599 if (akok->a_dual.need_sync_device()) { 600 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 601 bkok->a_dual.modify_host(); 602 } else { /* If device has the latest data, we only copy data on device */ 603 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 604 bkok->a_dual.modify_device(); 605 } 606 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 607 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 608 bkok->a_dual.modify_host(); 609 } 610 mat->spptr = bkok; 611 } 612 613 PetscCall(PetscFree(mat->defaultvectype)); 614 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 615 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 616 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 621 { 622 Mat At; 623 KokkosCsrMatrix internT; 624 Mat_SeqAIJKokkos *atkok, *bkok; 625 626 PetscFunctionBegin; 627 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 628 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 629 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 630 /* Deep copy internT, as we want to isolate the internal transpose */ 631 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 632 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 633 if (reuse == MAT_INITIAL_MATRIX) *B = At; 634 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 635 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 636 if ((*B)->assembled) { 637 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 638 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 639 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 640 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 641 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 642 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 643 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 644 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 645 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 646 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 647 } 648 PetscFunctionReturn(PETSC_SUCCESS); 649 } 650 651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 652 { 653 Mat_SeqAIJKokkos *aijkok; 654 655 PetscFunctionBegin; 656 if (A->factortype == MAT_FACTOR_NONE) { 657 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 658 delete aijkok; 659 } else { 660 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 661 } 662 A->spptr = NULL; 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 664 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 665 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 666 #if defined(PETSC_HAVE_HYPRE) 667 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 668 #endif 669 PetscCall(MatDestroy_SeqAIJ(A)); 670 PetscFunctionReturn(PETSC_SUCCESS); 671 } 672 673 /*MC 674 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 675 676 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 677 678 Options Database Key: 679 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 680 681 Level: beginner 682 683 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 684 M*/ 685 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 686 { 687 PetscFunctionBegin; 688 PetscCall(PetscKokkosInitializeCheck()); 689 PetscCall(MatCreate_SeqAIJ(A)); 690 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 691 PetscFunctionReturn(PETSC_SUCCESS); 692 } 693 694 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 695 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 696 { 697 Mat_SeqAIJ *a, *b; 698 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 699 MatScalarKokkosView aa, ba, ca; 700 MatRowMapKokkosView ai, bi, ci; 701 MatColIdxKokkosView aj, bj, cj; 702 PetscInt m, n, nnz, aN; 703 704 PetscFunctionBegin; 705 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 706 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 707 PetscAssertPointer(C, 4); 708 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 709 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 710 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 711 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 712 713 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 714 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 715 a = static_cast<Mat_SeqAIJ *>(A->data); 716 b = static_cast<Mat_SeqAIJ *>(B->data); 717 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 718 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 719 aa = akok->a_dual.view_device(); 720 ai = akok->i_dual.view_device(); 721 ba = bkok->a_dual.view_device(); 722 bi = bkok->i_dual.view_device(); 723 m = A->rmap->n; /* M, N and nnz of C */ 724 n = A->cmap->n + B->cmap->n; 725 nnz = a->nz + b->nz; 726 aN = A->cmap->n; /* N of A */ 727 if (reuse == MAT_INITIAL_MATRIX) { 728 aj = akok->j_dual.view_device(); 729 bj = bkok->j_dual.view_device(); 730 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 731 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 732 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 733 ca = ca_dual.view_device(); 734 ci = ci_dual.view_device(); 735 cj = cj_dual.view_device(); 736 737 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 738 Kokkos::parallel_for( 739 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 740 PetscInt i = t.league_rank(); /* row i */ 741 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 742 743 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 744 ci(i) = coffset; 745 if (i == m - 1) ci(m) = ai(m) + bi(m); 746 }); 747 748 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 749 if (k < alen) { 750 ca(coffset + k) = aa(ai(i) + k); 751 cj(coffset + k) = aj(ai(i) + k); 752 } else { 753 ca(coffset + k) = ba(bi(i) + k - alen); 754 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 755 } 756 }); 757 }); 758 ca_dual.modify_device(); 759 ci_dual.modify_device(); 760 cj_dual.modify_device(); 761 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 762 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 763 } else if (reuse == MAT_REUSE_MATRIX) { 764 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 765 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 766 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 767 ca = ckok->a_dual.view_device(); 768 ci = ckok->i_dual.view_device(); 769 770 Kokkos::parallel_for( 771 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 772 PetscInt i = t.league_rank(); /* row i */ 773 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 774 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 775 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 776 else ca(ci(i) + k) = ba(bi(i) + k - alen); 777 }); 778 }); 779 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 780 } 781 PetscFunctionReturn(PETSC_SUCCESS); 782 } 783 784 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 785 { 786 PetscFunctionBegin; 787 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 788 PetscFunctionReturn(PETSC_SUCCESS); 789 } 790 791 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 792 { 793 Mat_Product *product = C->product; 794 Mat A, B; 795 bool transA, transB; /* use bool, since KK needs this type */ 796 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 797 Mat_SeqAIJ *c; 798 MatProductData_SeqAIJKokkos *pdata; 799 KokkosCsrMatrix csrmatA, csrmatB; 800 801 PetscFunctionBegin; 802 MatCheckProduct(C, 1); 803 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 804 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 805 806 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 807 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 808 // we still do numeric. 809 if (pdata->reusesym) { // numeric reuses results from symbolic 810 pdata->reusesym = PETSC_FALSE; 811 PetscFunctionReturn(PETSC_SUCCESS); 812 } 813 814 switch (product->type) { 815 case MATPRODUCT_AB: 816 transA = false; 817 transB = false; 818 break; 819 case MATPRODUCT_AtB: 820 transA = true; 821 transB = false; 822 break; 823 case MATPRODUCT_ABt: 824 transA = false; 825 transB = true; 826 break; 827 default: 828 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 829 } 830 831 A = product->A; 832 B = product->B; 833 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 834 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 835 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 836 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 837 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 838 839 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 840 841 csrmatA = akok->csrmat; 842 csrmatB = bkok->csrmat; 843 844 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 845 if (transA) { 846 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 847 transA = false; 848 } 849 850 if (transB) { 851 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 852 transB = false; 853 } 854 PetscCall(PetscLogGpuTimeBegin()); 855 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 856 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 857 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 858 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 859 #endif 860 861 PetscCall(PetscLogGpuTimeEnd()); 862 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 863 /* shorter version of MatAssemblyEnd_SeqAIJ */ 864 c = (Mat_SeqAIJ *)C->data; 865 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 866 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 867 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 868 c->reallocs = 0; 869 C->info.mallocs = 0; 870 C->info.nz_unneeded = 0; 871 C->assembled = C->was_assembled = PETSC_TRUE; 872 C->num_ass++; 873 PetscFunctionReturn(PETSC_SUCCESS); 874 } 875 876 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 877 { 878 Mat_Product *product = C->product; 879 MatProductType ptype; 880 Mat A, B; 881 bool transA, transB; 882 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 883 MatProductData_SeqAIJKokkos *pdata; 884 MPI_Comm comm; 885 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 886 887 PetscFunctionBegin; 888 MatCheckProduct(C, 1); 889 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 890 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 891 A = product->A; 892 B = product->B; 893 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 894 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 895 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 896 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 897 csrmatA = akok->csrmat; 898 csrmatB = bkok->csrmat; 899 900 ptype = product->type; 901 // Take advantage of the symmetry if true 902 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 903 ptype = MATPRODUCT_AB; 904 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 905 } 906 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 907 ptype = MATPRODUCT_AB; 908 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 909 } 910 911 switch (ptype) { 912 case MATPRODUCT_AB: 913 transA = false; 914 transB = false; 915 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 916 break; 917 case MATPRODUCT_AtB: 918 transA = true; 919 transB = false; 920 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 921 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 922 break; 923 case MATPRODUCT_ABt: 924 transA = false; 925 transB = true; 926 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 927 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 928 break; 929 default: 930 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 931 } 932 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 933 pdata->reusesym = product->api_user; 934 935 /* TODO: add command line options to select spgemm algorithms */ 936 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 937 938 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 939 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 940 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 941 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 942 #endif 943 #endif 944 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 945 946 PetscCall(PetscLogGpuTimeBegin()); 947 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 948 if (transA) { 949 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 950 transA = false; 951 } 952 953 if (transB) { 954 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 955 transB = false; 956 } 957 958 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 959 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 960 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 961 calling new Mat_SeqAIJKokkos(). 962 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 963 */ 964 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 965 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 966 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 967 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 968 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 969 #endif 970 PetscCall(PetscLogGpuTimeEnd()); 971 972 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 973 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 974 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 975 PetscFunctionReturn(PETSC_SUCCESS); 976 } 977 978 /* handles sparse matrix matrix ops */ 979 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 980 { 981 Mat_Product *product = mat->product; 982 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 983 984 PetscFunctionBegin; 985 MatCheckProduct(mat, 1); 986 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 987 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 988 if (Biskok && Ciskok) { 989 switch (product->type) { 990 case MATPRODUCT_AB: 991 case MATPRODUCT_AtB: 992 case MATPRODUCT_ABt: 993 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 994 break; 995 case MATPRODUCT_PtAP: 996 case MATPRODUCT_RARt: 997 case MATPRODUCT_ABC: 998 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 999 break; 1000 default: 1001 break; 1002 } 1003 } else { /* fallback for AIJ */ 1004 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1005 } 1006 PetscFunctionReturn(PETSC_SUCCESS); 1007 } 1008 1009 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1010 { 1011 Mat_SeqAIJKokkos *aijkok; 1012 1013 PetscFunctionBegin; 1014 PetscCall(PetscLogGpuTimeBegin()); 1015 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1016 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1017 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1018 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1019 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1020 PetscCall(PetscLogGpuTimeEnd()); 1021 PetscFunctionReturn(PETSC_SUCCESS); 1022 } 1023 1024 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1025 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1026 { 1027 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1028 1029 PetscFunctionBegin; 1030 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1031 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1032 1033 PetscCall(PetscLogGpuTimeBegin()); 1034 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1035 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1036 const auto &Aa = aijkok->a_dual.view_device(); 1037 const auto &Adiag = aijkok->diag_dual.view_device(); 1038 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1039 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1040 PetscCall(PetscLogGpuFlops(n)); 1041 PetscCall(PetscLogGpuTimeEnd()); 1042 } else { // need reassembly, very slow! 1043 PetscCall(MatShift_Basic(A, a)); 1044 } 1045 PetscFunctionReturn(PETSC_SUCCESS); 1046 } 1047 1048 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1049 { 1050 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1051 1052 PetscFunctionBegin; 1053 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1054 ConstPetscScalarKokkosView dv; 1055 PetscInt n, nv; 1056 1057 PetscCall(PetscLogGpuTimeBegin()); 1058 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1059 PetscCall(VecGetKokkosView(D, &dv)); 1060 PetscCall(VecGetLocalSize(D, &nv)); 1061 n = PetscMin(Y->rmap->n, Y->cmap->n); 1062 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1063 1064 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1065 const auto &Aa = aijkok->a_dual.view_device(); 1066 const auto &Adiag = aijkok->diag_dual.view_device(); 1067 PetscCallCXX(Kokkos::parallel_for( 1068 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1069 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1070 else Aa(Adiag(i)) += dv(i); 1071 })); 1072 PetscCall(VecRestoreKokkosView(D, &dv)); 1073 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1074 PetscCall(PetscLogGpuFlops(n)); 1075 PetscCall(PetscLogGpuTimeEnd()); 1076 } else { // need reassembly, very slow! 1077 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1078 } 1079 PetscFunctionReturn(PETSC_SUCCESS); 1080 } 1081 1082 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1083 { 1084 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1085 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1086 ConstPetscScalarKokkosView lv, rv; 1087 1088 PetscFunctionBegin; 1089 PetscCall(PetscLogGpuTimeBegin()); 1090 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1091 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 const auto &Aa = aijkok->a_dual.view_device(); 1093 const auto &Ai = aijkok->i_dual.view_device(); 1094 const auto &Aj = aijkok->j_dual.view_device(); 1095 if (ll) { 1096 PetscCall(VecGetLocalSize(ll, &m)); 1097 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1098 PetscCall(VecGetKokkosView(ll, &lv)); 1099 PetscCallCXX(Kokkos::parallel_for( // for each row 1100 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1101 PetscInt i = t.league_rank(); // row i 1102 PetscInt len = Ai(i + 1) - Ai(i); 1103 // scale entries on the row 1104 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1105 })); 1106 PetscCall(VecRestoreKokkosView(ll, &lv)); 1107 PetscCall(PetscLogGpuFlops(nz)); 1108 } 1109 if (rr) { 1110 PetscCall(VecGetLocalSize(rr, &n)); 1111 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1112 PetscCall(VecGetKokkosView(rr, &rv)); 1113 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1114 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1115 PetscCall(VecRestoreKokkosView(rr, &lv)); 1116 PetscCall(PetscLogGpuFlops(nz)); 1117 } 1118 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1119 PetscCall(PetscLogGpuTimeEnd()); 1120 PetscFunctionReturn(PETSC_SUCCESS); 1121 } 1122 1123 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1124 { 1125 Mat_SeqAIJKokkos *aijkok; 1126 1127 PetscFunctionBegin; 1128 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1129 if (aijkok) { /* Only zero the device if data is already there */ 1130 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1131 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1132 } else { /* Might be preallocated but not assembled */ 1133 PetscCall(MatZeroEntries_SeqAIJ(A)); 1134 } 1135 PetscFunctionReturn(PETSC_SUCCESS); 1136 } 1137 1138 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1139 { 1140 Mat_SeqAIJKokkos *aijkok; 1141 PetscInt n; 1142 PetscScalarKokkosView xv; 1143 1144 PetscFunctionBegin; 1145 PetscCall(VecGetLocalSize(x, &n)); 1146 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1148 1149 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1150 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1151 1152 const auto &Aa = aijkok->a_dual.view_device(); 1153 const auto &Ai = aijkok->i_dual.view_device(); 1154 const auto &Adiag = aijkok->diag_dual.view_device(); 1155 1156 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1157 Kokkos::parallel_for( 1158 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1159 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1160 else xv(i) = 0; 1161 }); 1162 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1163 PetscFunctionReturn(PETSC_SUCCESS); 1164 } 1165 1166 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1167 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1168 { 1169 Mat_SeqAIJKokkos *aijkok; 1170 1171 PetscFunctionBegin; 1172 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1173 PetscAssertPointer(kv, 2); 1174 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1175 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1176 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1177 *kv = aijkok->a_dual.view_device(); 1178 PetscFunctionReturn(PETSC_SUCCESS); 1179 } 1180 1181 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1182 { 1183 PetscFunctionBegin; 1184 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1185 PetscAssertPointer(kv, 2); 1186 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1191 { 1192 Mat_SeqAIJKokkos *aijkok; 1193 1194 PetscFunctionBegin; 1195 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1196 PetscAssertPointer(kv, 2); 1197 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1198 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1199 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1200 *kv = aijkok->a_dual.view_device(); 1201 PetscFunctionReturn(PETSC_SUCCESS); 1202 } 1203 1204 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1205 { 1206 PetscFunctionBegin; 1207 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1208 PetscAssertPointer(kv, 2); 1209 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1210 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1215 { 1216 Mat_SeqAIJKokkos *aijkok; 1217 1218 PetscFunctionBegin; 1219 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1220 PetscAssertPointer(kv, 2); 1221 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1222 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1223 *kv = aijkok->a_dual.view_device(); 1224 PetscFunctionReturn(PETSC_SUCCESS); 1225 } 1226 1227 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1228 { 1229 PetscFunctionBegin; 1230 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1231 PetscAssertPointer(kv, 2); 1232 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1233 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1234 PetscFunctionReturn(PETSC_SUCCESS); 1235 } 1236 1237 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1238 { 1239 Mat_SeqAIJKokkos *akok; 1240 1241 PetscFunctionBegin; 1242 auto exec = PetscGetKokkosExecutionSpace(); 1243 // Create host copies of the input aij 1244 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1245 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1246 // Don't copy the vals to the host now 1247 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1248 1249 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1250 // Note we have modified device data so it will copy lazily 1251 a_dual.modify_device(); 1252 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1253 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1254 1255 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1256 PetscCall(MatCreate(comm, A)); 1257 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1258 PetscFunctionReturn(PETSC_SUCCESS); 1259 } 1260 1261 /* Computes Y += alpha X */ 1262 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1263 { 1264 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1265 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1266 ConstMatScalarKokkosView Xa; 1267 MatScalarKokkosView Ya; 1268 auto exec = PetscGetKokkosExecutionSpace(); 1269 1270 PetscFunctionBegin; 1271 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1272 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1273 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1274 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1275 PetscCall(PetscLogGpuTimeBegin()); 1276 1277 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1278 PetscBool e; 1279 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1280 if (e) { 1281 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1282 if (e) pattern = SAME_NONZERO_PATTERN; 1283 } 1284 } 1285 1286 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1287 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1288 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1289 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1290 */ 1291 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1292 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1293 Xa = xkok->a_dual.view_device(); 1294 Ya = ykok->a_dual.view_device(); 1295 1296 if (pattern == SAME_NONZERO_PATTERN) { 1297 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1298 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1299 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1300 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1301 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1302 1303 Kokkos::parallel_for( 1304 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1305 PetscInt i = t.league_rank(); // row i 1306 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1307 // Only one thread works in a team 1308 PetscInt p, q = Yi(i); 1309 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1310 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1311 if (Xj(p) == Yj(q)) { // Found it 1312 Ya(q) += alpha * Xa(p); 1313 q++; 1314 } else { 1315 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1316 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1317 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1318 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1319 #else 1320 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1321 #endif 1322 } 1323 } 1324 }); 1325 }); 1326 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1327 } else { // different nonzero patterns 1328 Mat Z; 1329 KokkosCsrMatrix zcsr; 1330 KernelHandle kh; 1331 kh.create_spadd_handle(true); // X, Y are sorted 1332 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1333 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1334 zkok = new Mat_SeqAIJKokkos(zcsr); 1335 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1336 PetscCall(MatHeaderReplace(Y, &Z)); 1337 kh.destroy_spadd_handle(); 1338 } 1339 PetscCall(PetscLogGpuTimeEnd()); 1340 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1341 PetscFunctionReturn(PETSC_SUCCESS); 1342 } 1343 1344 struct MatCOOStruct_SeqAIJKokkos { 1345 PetscCount n; 1346 PetscCount Atot; 1347 PetscInt nz; 1348 PetscCountKokkosView jmap; 1349 PetscCountKokkosView perm; 1350 1351 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1352 { 1353 nz = coo_h->nz; 1354 n = coo_h->n; 1355 Atot = coo_h->Atot; 1356 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1357 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1358 } 1359 }; 1360 1361 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1362 { 1363 PetscFunctionBegin; 1364 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1365 PetscFunctionReturn(PETSC_SUCCESS); 1366 } 1367 1368 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1369 { 1370 Mat_SeqAIJKokkos *akok; 1371 Mat_SeqAIJ *aseq; 1372 PetscContainer container_h; 1373 MatCOOStruct_SeqAIJ *coo_h; 1374 MatCOOStruct_SeqAIJKokkos *coo_d; 1375 1376 PetscFunctionBegin; 1377 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1378 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1379 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1380 delete akok; 1381 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1382 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1383 1384 // Copy the COO struct to device 1385 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1386 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1387 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1388 1389 // Put the COO struct in a container and then attach that to the matrix 1390 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1391 PetscFunctionReturn(PETSC_SUCCESS); 1392 } 1393 1394 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1395 { 1396 MatScalarKokkosView Aa; 1397 ConstMatScalarKokkosView kv; 1398 PetscMemType memtype; 1399 PetscContainer container; 1400 MatCOOStruct_SeqAIJKokkos *coo; 1401 1402 PetscFunctionBegin; 1403 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1404 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1405 1406 const auto &n = coo->n; 1407 const auto &Annz = coo->nz; 1408 const auto &jmap = coo->jmap; 1409 const auto &perm = coo->perm; 1410 1411 PetscCall(PetscGetMemType(v, &memtype)); 1412 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1413 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1414 } else { 1415 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1416 } 1417 1418 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1419 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1420 1421 PetscCall(PetscLogGpuTimeBegin()); 1422 Kokkos::parallel_for( 1423 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1424 PetscScalar sum = 0.0; 1425 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1426 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1427 }); 1428 PetscCall(PetscLogGpuTimeEnd()); 1429 1430 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1431 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1432 PetscFunctionReturn(PETSC_SUCCESS); 1433 } 1434 1435 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1436 { 1437 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1438 1439 PetscFunctionBegin; 1440 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1441 A->boundtocpu = PETSC_FALSE; 1442 1443 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1444 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1445 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1446 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1447 A->ops->scale = MatScale_SeqAIJKokkos; 1448 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1449 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1450 A->ops->mult = MatMult_SeqAIJKokkos; 1451 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1452 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1453 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1454 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1455 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1456 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1457 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1458 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1459 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1460 A->ops->shift = MatShift_SeqAIJKokkos; 1461 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1462 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1463 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1464 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1465 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1466 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1467 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1468 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1469 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1470 1471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1472 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1473 #if defined(PETSC_HAVE_HYPRE) 1474 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1475 #endif 1476 PetscFunctionReturn(PETSC_SUCCESS); 1477 } 1478 1479 /* 1480 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1481 1482 Input Parameters: 1483 + A - the MATSEQAIJKOKKOS matrix 1484 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1485 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1486 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1487 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1488 1489 Output Parameter: 1490 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1491 */ 1492 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1493 { 1494 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1495 PetscInt nblocks = bs.extent(0) - 1; 1496 1497 PetscFunctionBegin; 1498 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1499 1500 // Pull out the diagonal blocks of the matrix and then invert the blocks 1501 auto Aa = akok->a_dual.view_device(); 1502 auto Ai = akok->i_dual.view_device(); 1503 auto Aj = akok->j_dual.view_device(); 1504 auto Adiag = akok->diag_dual.view_device(); 1505 // TODO: how to tune the team size? 1506 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1507 auto ts = Kokkos::AUTO(); 1508 #else 1509 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1510 #endif 1511 PetscCallCXX(Kokkos::parallel_for( 1512 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1513 const PetscInt bid = teamMember.league_rank(); // block id 1514 const PetscInt rstart = bs(bid); // this block starts from this row 1515 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1516 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1517 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1518 1519 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1520 PetscInt i = rstart + r; // i-th row in A 1521 1522 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1523 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1524 1525 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1526 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1527 B(r, c) = 0.0; 1528 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1529 B(r, c) = Aa(first + c); 1530 } else { // this entry does not show up in the CSR 1531 B(r, c) = 0.0; 1532 } 1533 } 1534 } else { // rare case that the diagonal does not exist 1535 const PetscInt begin = Ai(i); 1536 const PetscInt end = Ai(i + 1); 1537 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1538 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1539 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1540 else if (Aj(j) >= rstart + m) break; 1541 } 1542 } 1543 }); 1544 1545 // LU-decompose B (w/o pivoting) and then invert B 1546 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1547 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1548 })); 1549 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1550 PetscFunctionReturn(PETSC_SUCCESS); 1551 } 1552 1553 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1554 { 1555 Mat_SeqAIJ *aseq; 1556 PetscInt i, m, n; 1557 auto exec = PetscGetKokkosExecutionSpace(); 1558 1559 PetscFunctionBegin; 1560 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1561 1562 m = akok->nrows(); 1563 n = akok->ncols(); 1564 PetscCall(MatSetSizes(A, m, n, m, n)); 1565 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1566 1567 /* Set up data structures of A as a MATSEQAIJ */ 1568 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1569 aseq = (Mat_SeqAIJ *)A->data; 1570 1571 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1572 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1573 1574 aseq->i = akok->i_host_data(); 1575 aseq->j = akok->j_host_data(); 1576 aseq->a = akok->a_host_data(); 1577 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1578 aseq->free_a = PETSC_FALSE; 1579 aseq->free_ij = PETSC_FALSE; 1580 aseq->nz = akok->nnz(); 1581 aseq->maxnz = aseq->nz; 1582 1583 PetscCall(PetscMalloc1(m, &aseq->imax)); 1584 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1585 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1586 1587 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1588 akok->nonzerostate = A->nonzerostate; 1589 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1590 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1591 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1592 PetscFunctionReturn(PETSC_SUCCESS); 1593 } 1594 1595 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1596 { 1597 PetscFunctionBegin; 1598 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1599 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1600 PetscFunctionReturn(PETSC_SUCCESS); 1601 } 1602 1603 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1604 { 1605 Mat_SeqAIJKokkos *akok; 1606 1607 PetscFunctionBegin; 1608 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1609 PetscCall(MatCreate(comm, A)); 1610 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1611 PetscFunctionReturn(PETSC_SUCCESS); 1612 } 1613 1614 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1615 1616 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1617 */ 1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1619 { 1620 PetscFunctionBegin; 1621 PetscCall(MatCreate(comm, A)); 1622 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 /*@C 1627 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1628 (the default parallel PETSc format). This matrix will ultimately be handled by 1629 Kokkos for calculations. 1630 1631 Collective 1632 1633 Input Parameters: 1634 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1635 . m - number of rows 1636 . n - number of columns 1637 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1638 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1639 1640 Output Parameter: 1641 . A - the matrix 1642 1643 Level: intermediate 1644 1645 Notes: 1646 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1647 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1648 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1649 1650 The AIJ format, also called 1651 compressed row storage, is fully compatible with standard Fortran 1652 storage. That is, the stored row and column indices can begin at 1653 either one (as in Fortran) or zero. 1654 1655 Specify the preallocated storage with either `nz` or `nnz` (not both). 1656 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1657 allocation. 1658 1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1660 @*/ 1661 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1662 { 1663 PetscFunctionBegin; 1664 PetscCall(PetscKokkosInitializeCheck()); 1665 PetscCall(MatCreate(comm, A)); 1666 PetscCall(MatSetSizes(*A, m, n, m, n)); 1667 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1668 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1669 PetscFunctionReturn(PETSC_SUCCESS); 1670 } 1671 1672 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1673 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1674 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1675 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1676 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1677 { 1678 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1679 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1680 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1681 1682 PetscFunctionBegin; 1683 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1684 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1685 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1686 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1687 } 1688 PetscFunctionReturn(PETSC_SUCCESS); 1689 } 1690 1691 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1692 { 1693 const PetscInt n = A->rmap->n; 1694 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1695 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1696 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1697 1698 PetscFunctionBegin; 1699 if (!factors->transpose_updated) { 1700 if (has_upper) { 1701 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1702 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1703 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1704 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1705 } 1706 1707 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1708 if (!factors->U) { 1709 Mat_SeqAIJ *seq; 1710 1711 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1712 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1713 1714 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1715 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1716 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1717 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1718 } else { 1719 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1720 } 1721 // Copy Ut from host to device 1722 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1723 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1724 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1725 } else { // If U was computed on device, we also compute the transpose there 1726 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1727 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1728 factors->jU_d, factors->aU_d, 1729 factors->iUt_d, factors->jUt_d, 1730 factors->aUt_d)); 1731 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1732 } 1733 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1734 } 1735 1736 // do the same for L with LU 1737 if (has_lower) { 1738 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1739 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1740 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1741 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1742 } 1743 1744 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1745 if (!factors->L) { 1746 Mat_SeqAIJ *seq; 1747 1748 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1749 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1750 1751 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1752 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1753 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1754 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1755 } else { 1756 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1757 } 1758 // Copy Lt from host to device 1759 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1760 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1761 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1762 } else { // If L was computed on device, we also compute the transpose there 1763 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1764 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1765 factors->jL_d, factors->aL_d, 1766 factors->iLt_d, factors->jLt_d, 1767 factors->aLt_d)); 1768 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1769 } 1770 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1771 } 1772 1773 factors->transpose_updated = PETSC_TRUE; 1774 } 1775 PetscFunctionReturn(PETSC_SUCCESS); 1776 } 1777 1778 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1779 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1780 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1781 { 1782 auto exec = PetscGetKokkosExecutionSpace(); 1783 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1784 PetscInt m = A->rmap->n; 1785 PetscScalarKokkosView D = factors->D_d; 1786 PetscScalarKokkosView X, Y, B; // alias 1787 ConstPetscScalarKokkosView b; 1788 PetscScalarKokkosView x; 1789 PetscIntKokkosView &rowperm = factors->rowperm; 1790 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1791 1792 PetscFunctionBegin; 1793 PetscCall(PetscLogGpuTimeBegin()); 1794 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1795 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1796 PetscCall(VecGetKokkosView(bb, &b)); 1797 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1798 1799 // Solve U^T Y = B 1800 if (identity) { // Reorder b with the row permutation 1801 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1802 Y = factors->workVector; 1803 } else { 1804 B = factors->workVector; 1805 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1806 Y = x; 1807 } 1808 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1809 1810 // Solve diag(D) Y' = Y. 1811 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1812 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1813 1814 // Solve UX = Y 1815 if (identity) { 1816 X = x; 1817 } else { 1818 X = factors->workVector; // B is not needed anymore 1819 } 1820 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1821 1822 // Reorder X with the inverse column (row) permutation 1823 if (!identity) { 1824 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1825 } 1826 1827 PetscCall(VecRestoreKokkosView(bb, &b)); 1828 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1829 PetscCall(PetscLogGpuTimeEnd()); 1830 PetscFunctionReturn(PETSC_SUCCESS); 1831 } 1832 1833 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1834 // R and C are represented by rowperm and colperm in factors. 1835 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1836 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1837 { 1838 auto exec = PetscGetKokkosExecutionSpace(); 1839 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1840 PetscInt m = A->rmap->n; 1841 PetscScalarKokkosView X, Y, B; // alias 1842 ConstPetscScalarKokkosView b; 1843 PetscScalarKokkosView x; 1844 PetscIntKokkosView &rowperm = factors->rowperm; 1845 PetscIntKokkosView &colperm = factors->colperm; 1846 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1847 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1848 1849 PetscFunctionBegin; 1850 PetscCall(PetscLogGpuTimeBegin()); 1851 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1852 PetscCall(VecGetKokkosView(bb, &b)); 1853 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1854 1855 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1856 if (row_identity) { 1857 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1858 Y = factors->workVector; 1859 } else { 1860 B = factors->workVector; 1861 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1862 Y = x; 1863 } 1864 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1865 1866 // Solve U C^- x = Y 1867 if (col_identity) { 1868 X = x; 1869 } else { 1870 X = factors->workVector; 1871 } 1872 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1873 1874 // x = C X; Reorder X with the inverse col permutation 1875 if (!col_identity) { 1876 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1877 } 1878 1879 PetscCall(VecRestoreKokkosView(bb, &b)); 1880 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1881 PetscCall(PetscLogGpuTimeEnd()); 1882 PetscFunctionReturn(PETSC_SUCCESS); 1883 } 1884 1885 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1886 // R and C are represented by rowperm and colperm in factors. 1887 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1888 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1889 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1890 { 1891 auto exec = PetscGetKokkosExecutionSpace(); 1892 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1893 PetscInt m = A->rmap->n; 1894 PetscScalarKokkosView X, Y, B; // alias 1895 ConstPetscScalarKokkosView b; 1896 PetscScalarKokkosView x; 1897 PetscIntKokkosView &rowperm = factors->rowperm; 1898 PetscIntKokkosView &colperm = factors->colperm; 1899 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1900 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1901 1902 PetscFunctionBegin; 1903 PetscCall(PetscLogGpuTimeBegin()); 1904 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1905 PetscCall(VecGetKokkosView(bb, &b)); 1906 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1907 1908 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1909 if (col_identity) { // Reorder b with the col permutation 1910 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1911 Y = factors->workVector; 1912 } else { 1913 B = factors->workVector; 1914 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1915 Y = x; 1916 } 1917 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1918 1919 // Solve L^T X = Y 1920 if (row_identity) { 1921 X = x; 1922 } else { 1923 X = factors->workVector; 1924 } 1925 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1926 1927 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1928 if (!row_identity) { 1929 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1930 } 1931 1932 PetscCall(VecRestoreKokkosView(bb, &b)); 1933 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1934 PetscCall(PetscLogGpuTimeEnd()); 1935 PetscFunctionReturn(PETSC_SUCCESS); 1936 } 1937 1938 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1939 { 1940 PetscFunctionBegin; 1941 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1942 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1943 1944 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1945 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1946 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1947 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1948 const MatScalar *Ba = b->a; 1949 PetscInt m = B->rmap->n, n = B->cmap->n; 1950 1951 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1952 // Allocate memory and copy the structure 1953 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1954 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1955 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1956 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1957 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1958 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1959 1960 PetscInt *Li = factors->iL_h.data(); 1961 PetscInt *Lj = factors->jL_h.data(); 1962 PetscInt *Ui = factors->iU_h.data(); 1963 PetscInt *Uj = factors->jU_h.data(); 1964 1965 Li[0] = Ui[0] = 0; 1966 for (PetscInt i = 0; i < m; i++) { 1967 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1968 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1969 1970 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1971 Lj[Li[i] + llen] = i; // diagonal entry of L 1972 1973 Uj[Ui[i]] = i; // diagonal entry of U 1974 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1975 1976 Li[i + 1] = Li[i] + llen + 1; 1977 Ui[i + 1] = Ui[i] + ulen; 1978 } 1979 1980 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1981 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1982 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1983 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1984 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1985 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1986 1987 // Copy row/col permutation to device 1988 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1989 PetscBool row_identity; 1990 PetscCall(ISIdentity(rowperm, &row_identity)); 1991 if (!row_identity) { 1992 const PetscInt *ip; 1993 1994 PetscCall(ISGetIndices(rowperm, &ip)); 1995 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1996 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1997 PetscCall(ISRestoreIndices(rowperm, &ip)); 1998 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1999 } 2000 2001 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2002 PetscBool col_identity; 2003 PetscCall(ISIdentity(colperm, &col_identity)); 2004 if (!col_identity) { 2005 const PetscInt *ip; 2006 2007 PetscCall(ISGetIndices(colperm, &ip)); 2008 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2009 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2010 PetscCall(ISRestoreIndices(colperm, &ip)); 2011 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2012 } 2013 2014 /* Create sptrsv handles for L, U and their transpose */ 2015 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2016 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2017 #else 2018 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2019 #endif 2020 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2021 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2022 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2023 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2024 } 2025 2026 // Copy the value 2027 for (PetscInt i = 0; i < m; i++) { 2028 PetscInt llen = Bi[i + 1] - Bi[i]; 2029 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2030 const PetscInt *Li = factors->iL_h.data(); 2031 const PetscInt *Ui = factors->iU_h.data(); 2032 2033 PetscScalar *La = factors->aL_h.data(); 2034 PetscScalar *Ua = factors->aU_h.data(); 2035 2036 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2037 La[Li[i] + llen] = 1.0; // diagonal entry 2038 2039 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2040 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2041 } 2042 2043 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2044 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2045 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2046 factors->transpose_updated = PETSC_FALSE; 2047 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2048 2049 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2050 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2051 } 2052 2053 B->ops->matsolve = NULL; 2054 B->ops->matsolvetranspose = NULL; 2055 PetscFunctionReturn(PETSC_SUCCESS); 2056 } 2057 2058 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2059 { 2060 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2061 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2062 PetscInt fill_lev = info->levels; 2063 2064 PetscFunctionBegin; 2065 PetscCall(PetscLogGpuTimeBegin()); 2066 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2067 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2068 2069 auto a_d = aijkok->a_dual.view_device(); 2070 auto i_d = aijkok->i_dual.view_device(); 2071 auto j_d = aijkok->j_dual.view_device(); 2072 2073 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2074 2075 B->assembled = PETSC_TRUE; 2076 B->preallocated = PETSC_TRUE; 2077 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2078 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2079 B->ops->matsolve = NULL; 2080 B->ops->matsolvetranspose = NULL; 2081 2082 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2083 factors->transpose_updated = PETSC_FALSE; 2084 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2085 /* TODO: log flops, but how to know that? */ 2086 PetscCall(PetscLogGpuTimeEnd()); 2087 PetscFunctionReturn(PETSC_SUCCESS); 2088 } 2089 2090 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2091 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2092 { 2093 Mat_SeqAIJKokkos *aijkok; 2094 Mat_SeqAIJ *b; 2095 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2096 PetscInt fill_lev = info->levels; 2097 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2098 PetscInt n = A->rmap->n; 2099 2100 PetscFunctionBegin; 2101 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2102 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2103 2104 /* Create a spiluk handle and then do symbolic factorization */ 2105 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2106 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2107 2108 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2109 2110 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2111 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2112 Kokkos::realloc(factors->iU_d, n + 1); 2113 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2114 2115 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2116 auto i_d = aijkok->i_dual.view_device(); 2117 auto j_d = aijkok->j_dual.view_device(); 2118 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2119 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2120 2121 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2122 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2123 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2124 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2125 2126 /* TODO: add options to select sptrsv algorithms */ 2127 /* Create sptrsv handles for L, U and their transpose */ 2128 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2129 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2130 #else 2131 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2132 #endif 2133 2134 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2135 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2136 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2137 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2138 2139 /* Fill fields of the factor matrix B */ 2140 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2141 b = (Mat_SeqAIJ *)B->data; 2142 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2143 B->info.fill_ratio_given = info->fill; 2144 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2145 2146 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2147 PetscFunctionReturn(PETSC_SUCCESS); 2148 } 2149 2150 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2151 { 2152 PetscFunctionBegin; 2153 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2154 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2155 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2156 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2157 PetscFunctionReturn(PETSC_SUCCESS); 2158 } 2159 2160 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2161 { 2162 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2163 2164 PetscFunctionBegin; 2165 if (!info->factoronhost) { 2166 PetscCall(ISIdentity(isrow, &row_identity)); 2167 PetscCall(ISIdentity(iscol, &col_identity)); 2168 } 2169 2170 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2171 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2172 2173 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2174 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2175 } else { 2176 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2177 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2178 } 2179 PetscFunctionReturn(PETSC_SUCCESS); 2180 } 2181 2182 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2183 { 2184 PetscFunctionBegin; 2185 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2186 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2187 2188 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2189 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2190 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2191 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2192 const MatScalar *Ba = b->a; 2193 PetscInt m = B->rmap->n; 2194 2195 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2196 // Allocate memory and copy the structure 2197 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2198 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2199 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2200 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2201 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2202 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2203 2204 // Build jU_h from the skewed Aj 2205 PetscInt *Uj = factors->jU_h.data(); 2206 for (PetscInt i = 0; i < m; i++) { 2207 PetscInt ulen = Bi[i + 1] - Bi[i]; 2208 Uj[Bi[i]] = i; // diagonal entry 2209 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2210 } 2211 2212 // Copy iU, jU to device 2213 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2214 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2215 2216 // Copy row/col permutation to device 2217 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2218 PetscBool row_identity; 2219 PetscCall(ISIdentity(rowperm, &row_identity)); 2220 if (!row_identity) { 2221 const PetscInt *ip; 2222 2223 PetscCall(ISGetIndices(rowperm, &ip)); 2224 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2225 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2226 PetscCall(ISRestoreIndices(rowperm, &ip)); 2227 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2228 } 2229 2230 // Create sptrsv handles for U and U^T 2231 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2232 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2233 #else 2234 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2235 #endif 2236 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2237 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2238 } 2239 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2240 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2241 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2242 2243 // Copy the value 2244 PetscScalar *Ua = factors->aU_h.data(); 2245 PetscScalar *D = factors->D_h.data(); 2246 for (PetscInt i = 0; i < m; i++) { 2247 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2248 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2249 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2250 } 2251 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2252 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2253 2254 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2255 factors->transpose_updated = PETSC_FALSE; 2256 } 2257 2258 B->ops->matsolve = NULL; 2259 B->ops->matsolvetranspose = NULL; 2260 PetscFunctionReturn(PETSC_SUCCESS); 2261 } 2262 2263 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2264 { 2265 PetscFunctionBegin; 2266 if (info->solveonhost) { 2267 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2268 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2269 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2270 } 2271 2272 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2273 2274 if (!info->solveonhost) { 2275 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2276 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2277 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2278 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2279 } 2280 PetscFunctionReturn(PETSC_SUCCESS); 2281 } 2282 2283 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2284 { 2285 PetscFunctionBegin; 2286 if (info->solveonhost) { 2287 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2288 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2289 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2290 } 2291 2292 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2293 2294 if (!info->solveonhost) { 2295 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2296 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2297 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2298 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2299 } 2300 PetscFunctionReturn(PETSC_SUCCESS); 2301 } 2302 2303 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2304 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2305 { 2306 PetscFunctionBegin; 2307 *type = MATSOLVERKOKKOS; 2308 PetscFunctionReturn(PETSC_SUCCESS); 2309 } 2310 2311 /*MC 2312 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2313 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2314 2315 Level: beginner 2316 2317 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2318 M*/ 2319 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2320 { 2321 PetscInt n = A->rmap->n; 2322 MPI_Comm comm; 2323 2324 PetscFunctionBegin; 2325 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2326 PetscCall(MatCreate(comm, B)); 2327 PetscCall(MatSetSizes(*B, n, n, n, n)); 2328 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2329 (*B)->factortype = ftype; 2330 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2331 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2332 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2333 2334 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2335 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2336 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2337 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2338 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2339 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2340 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2341 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2342 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2343 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2344 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2345 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2346 2347 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2348 (*B)->canuseordering = PETSC_TRUE; 2349 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2350 PetscFunctionReturn(PETSC_SUCCESS); 2351 } 2352 2353 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2354 { 2355 PetscFunctionBegin; 2356 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2357 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2358 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2359 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2360 PetscFunctionReturn(PETSC_SUCCESS); 2361 } 2362 2363 /* Utility to print out a KokkosCsrMatrix for debugging */ 2364 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2365 { 2366 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2367 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2368 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2369 const PetscInt *i = iv.data(); 2370 const PetscInt *j = jv.data(); 2371 const PetscScalar *a = av.data(); 2372 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2373 2374 PetscFunctionBegin; 2375 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2376 for (PetscInt k = 0; k < m; k++) { 2377 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2378 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2379 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2380 } 2381 PetscFunctionReturn(PETSC_SUCCESS); 2382 } 2383