1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsystypes.h> 9 #include <petscerror.h> 10 11 #include <Kokkos_Core.hpp> 12 #include <KokkosBlas.hpp> 13 #include <KokkosSparse_CrsMatrix.hpp> 14 15 // To suppress compiler warnings: 16 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 17 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 18 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 19 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 20 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 21 #include <KokkosSparse_spmv.hpp> 22 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 23 24 #include <KokkosSparse_spiluk.hpp> 25 #include <KokkosSparse_sptrsv.hpp> 26 #include <KokkosSparse_spgemm.hpp> 27 #include <KokkosSparse_spadd.hpp> 28 #include <KokkosBatched_LU_Decl.hpp> 29 #include <KokkosBatched_InverseLU_Decl.hpp> 30 31 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 32 33 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 34 #include <KokkosSparse_Utils.hpp> 35 using KokkosSparse::sort_crs_matrix; 36 using KokkosSparse::Impl::transpose_matrix; 37 #else 38 #include <KokkosKernels_Sorting.hpp> 39 using KokkosKernels::sort_crs_matrix; 40 using KokkosKernels::Impl::transpose_matrix; 41 #endif 42 43 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 44 using KokkosSparse::spiluk_symbolic; 45 using KokkosSparse::spiluk_numeric; 46 using KokkosSparse::sptrsv_symbolic; 47 using KokkosSparse::sptrsv_solve; 48 using KokkosSparse::Experimental::SPTRSVAlgorithm; 49 using KokkosSparse::Experimental::SPILUKAlgorithm; 50 #else 51 using KokkosSparse::Experimental::spiluk_symbolic; 52 using KokkosSparse::Experimental::spiluk_numeric; 53 using KokkosSparse::Experimental::sptrsv_symbolic; 54 using KokkosSparse::Experimental::sptrsv_solve; 55 using KokkosSparse::Experimental::SPTRSVAlgorithm; 56 using KokkosSparse::Experimental::SPILUKAlgorithm; 57 #endif 58 59 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 60 61 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 62 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 63 In the latter case, it is important to set a_dual's sync state correctly. 64 */ 65 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 66 { 67 Mat_SeqAIJ *aijseq; 68 Mat_SeqAIJKokkos *aijkok; 69 70 PetscFunctionBegin; 71 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 72 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 73 74 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 75 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 76 77 /* If aijkok does not exist, we just copy i, j to device. 78 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 79 In both cases, we build a new aijkok structure. 80 */ 81 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 82 delete aijkok; 83 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 84 A->spptr = aijkok; 85 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 86 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 87 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 88 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 89 } 90 PetscFunctionReturn(PETSC_SUCCESS); 91 } 92 93 /* Sync CSR data to device if not yet */ 94 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 95 { 96 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 97 98 PetscFunctionBegin; 99 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 100 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 101 if (aijkok->a_dual.need_sync_device()) { 102 aijkok->a_dual.sync_device(); 103 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 104 aijkok->hermitian_updated = PETSC_FALSE; 105 } 106 PetscFunctionReturn(PETSC_SUCCESS); 107 } 108 109 /* Mark the CSR data on device as modified */ 110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 111 { 112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 113 114 PetscFunctionBegin; 115 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 116 aijkok->a_dual.clear_sync_state(); 117 aijkok->a_dual.modify_device(); 118 aijkok->transpose_updated = PETSC_FALSE; 119 aijkok->hermitian_updated = PETSC_FALSE; 120 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 121 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 128 auto exec = PetscGetKokkosExecutionSpace(); 129 130 PetscFunctionBegin; 131 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 132 /* We do not expect one needs factors on host */ 133 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 134 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 135 PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 143 PetscFunctionBegin; 144 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 145 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 146 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 147 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 148 */ 149 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 150 auto exec = PetscGetKokkosExecutionSpace(); 151 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 152 PetscCallCXX(exec.fence()); 153 *array = aijkok->a_dual.view_host().data(); 154 } else { /* Happens when calling MatSetValues on a newly created matrix */ 155 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 156 } 157 PetscFunctionReturn(PETSC_SUCCESS); 158 } 159 160 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 161 { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 166 PetscFunctionReturn(PETSC_SUCCESS); 167 } 168 169 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 170 { 171 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 172 173 PetscFunctionBegin; 174 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 175 auto exec = PetscGetKokkosExecutionSpace(); 176 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 177 PetscCallCXX(exec.fence()); 178 *array = aijkok->a_dual.view_host().data(); 179 } else { 180 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 181 } 182 PetscFunctionReturn(PETSC_SUCCESS); 183 } 184 185 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 186 { 187 PetscFunctionBegin; 188 *array = NULL; 189 PetscFunctionReturn(PETSC_SUCCESS); 190 } 191 192 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 193 { 194 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 195 196 PetscFunctionBegin; 197 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 198 *array = aijkok->a_dual.view_host().data(); 199 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 200 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 201 } 202 PetscFunctionReturn(PETSC_SUCCESS); 203 } 204 205 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 206 { 207 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 208 209 PetscFunctionBegin; 210 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 211 aijkok->a_dual.clear_sync_state(); 212 aijkok->a_dual.modify_host(); 213 } 214 PetscFunctionReturn(PETSC_SUCCESS); 215 } 216 217 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 218 { 219 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 220 221 PetscFunctionBegin; 222 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 223 224 if (i) *i = aijkok->i_device_data(); 225 if (j) *j = aijkok->j_device_data(); 226 if (a) { 227 aijkok->a_dual.sync_device(); 228 *a = aijkok->a_device_data(); 229 } 230 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 231 PetscFunctionReturn(PETSC_SUCCESS); 232 } 233 234 /* 235 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 236 237 Input Parameter: 238 . A - the MATSEQAIJKOKKOS matrix 239 240 Output Parameters: 241 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 242 - T_d - the transpose on device, whose value array is allocated but not initialized 243 */ 244 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 245 { 246 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 247 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 248 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 249 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 250 MatRowMapType *Ti = Ti_h.data(); 251 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 252 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 253 PetscInt *Tj = Tj_h.data(); 254 PetscInt *perm = perm_h.data(); 255 PetscInt *offset; 256 257 PetscFunctionBegin; 258 // Populate Ti 259 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 260 Ti++; 261 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 262 Ti--; 263 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 264 265 // Populate Tj and the permutation array 266 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 267 for (PetscInt i = 0; i < m; i++) { 268 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 269 PetscInt r = Aj[j]; // row r of T 270 PetscInt disp = Ti[r] + offset[r]; 271 272 Tj[disp] = i; // col i of T 273 perm[disp] = j; 274 offset[r]++; 275 } 276 } 277 PetscCall(PetscFree(offset)); 278 279 // Sort each row of T, along with the permutation array 280 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 281 282 // Output perm and T on device 283 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 284 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 285 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 286 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 287 PetscFunctionReturn(PETSC_SUCCESS); 288 } 289 290 // Generate the transpose on device and cache it internally 291 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 292 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 293 { 294 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 295 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 296 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 297 KokkosCsrMatrix &T = akok->csrmatT; 298 299 PetscFunctionBegin; 300 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 301 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 302 303 const auto &Aa = akok->a_dual.view_device(); 304 305 if (A->symmetric == PETSC_BOOL3_TRUE) { 306 *csrmatT = akok->csrmat; 307 } else { 308 // See if we already have a cached transpose and its value is up to date 309 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 310 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 311 const auto &perm = akok->transpose_perm; // get the permutation array 312 auto &Ta = T.values; 313 314 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 315 } 316 } else { // Generate T of size n x m for the first time 317 MatRowMapKokkosView perm; 318 319 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 320 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 321 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 322 } 323 akok->transpose_updated = PETSC_TRUE; 324 *csrmatT = akok->csrmatT; 325 } 326 PetscFunctionReturn(PETSC_SUCCESS); 327 } 328 329 // Generate the Hermitian on device and cache it internally 330 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 331 { 332 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 333 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 334 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 335 KokkosCsrMatrix &T = akok->csrmatH; 336 337 PetscFunctionBegin; 338 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 339 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 340 341 const auto &Aa = akok->a_dual.view_device(); 342 343 if (A->hermitian == PETSC_BOOL3_TRUE) { 344 *csrmatH = akok->csrmat; 345 } else { 346 // See if we already have a cached hermitian and its value is up to date 347 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 348 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 349 const auto &perm = akok->transpose_perm; // get the permutation array 350 auto &Ta = T.values; 351 352 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 353 } 354 } else { // Generate T of size n x m for the first time 355 MatRowMapKokkosView perm; 356 357 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 358 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 359 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 360 } 361 akok->hermitian_updated = PETSC_TRUE; 362 *csrmatH = akok->csrmatH; 363 } 364 PetscFunctionReturn(PETSC_SUCCESS); 365 } 366 367 /* y = A x */ 368 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 369 { 370 Mat_SeqAIJKokkos *aijkok; 371 ConstPetscScalarKokkosView xv; 372 PetscScalarKokkosView yv; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx, &xv)); 378 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 379 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 380 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 381 PetscCall(VecRestoreKokkosView(xx, &xv)); 382 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 383 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 384 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 385 PetscCall(PetscLogGpuTimeEnd()); 386 PetscFunctionReturn(PETSC_SUCCESS); 387 } 388 389 /* y = A^T x */ 390 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 391 { 392 Mat_SeqAIJKokkos *aijkok; 393 const char *mode; 394 ConstPetscScalarKokkosView xv; 395 PetscScalarKokkosView yv; 396 KokkosCsrMatrix csrmat; 397 398 PetscFunctionBegin; 399 PetscCall(PetscLogGpuTimeBegin()); 400 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 401 PetscCall(VecGetKokkosView(xx, &xv)); 402 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 403 if (A->form_explicit_transpose) { 404 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 408 csrmat = aijkok->csrmat; 409 mode = "T"; 410 } 411 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 412 PetscCall(VecRestoreKokkosView(xx, &xv)); 413 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 414 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 415 PetscCall(PetscLogGpuTimeEnd()); 416 PetscFunctionReturn(PETSC_SUCCESS); 417 } 418 419 /* y = A^H x */ 420 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 421 { 422 Mat_SeqAIJKokkos *aijkok; 423 const char *mode; 424 ConstPetscScalarKokkosView xv; 425 PetscScalarKokkosView yv; 426 KokkosCsrMatrix csrmat; 427 428 PetscFunctionBegin; 429 PetscCall(PetscLogGpuTimeBegin()); 430 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 431 PetscCall(VecGetKokkosView(xx, &xv)); 432 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 433 if (A->form_explicit_transpose) { 434 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 435 mode = "N"; 436 } else { 437 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 438 csrmat = aijkok->csrmat; 439 mode = "C"; 440 } 441 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 442 PetscCall(VecRestoreKokkosView(xx, &xv)); 443 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 444 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 445 PetscCall(PetscLogGpuTimeEnd()); 446 PetscFunctionReturn(PETSC_SUCCESS); 447 } 448 449 /* z = A x + y */ 450 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 451 { 452 Mat_SeqAIJKokkos *aijkok; 453 ConstPetscScalarKokkosView xv; 454 PetscScalarKokkosView zv; 455 456 PetscFunctionBegin; 457 PetscCall(PetscLogGpuTimeBegin()); 458 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 459 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 460 PetscCall(VecGetKokkosView(xx, &xv)); 461 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 462 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 463 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 464 PetscCall(VecRestoreKokkosView(xx, &xv)); 465 PetscCall(VecRestoreKokkosView(zz, &zv)); 466 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 467 PetscCall(PetscLogGpuTimeEnd()); 468 PetscFunctionReturn(PETSC_SUCCESS); 469 } 470 471 /* z = A^T x + y */ 472 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 473 { 474 Mat_SeqAIJKokkos *aijkok; 475 const char *mode; 476 ConstPetscScalarKokkosView xv; 477 PetscScalarKokkosView zv; 478 KokkosCsrMatrix csrmat; 479 480 PetscFunctionBegin; 481 PetscCall(PetscLogGpuTimeBegin()); 482 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 483 if (zz != yy) PetscCall(VecCopy(yy, zz)); 484 PetscCall(VecGetKokkosView(xx, &xv)); 485 PetscCall(VecGetKokkosView(zz, &zv)); 486 if (A->form_explicit_transpose) { 487 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 488 mode = "N"; 489 } else { 490 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 491 csrmat = aijkok->csrmat; 492 mode = "T"; 493 } 494 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 495 PetscCall(VecRestoreKokkosView(xx, &xv)); 496 PetscCall(VecRestoreKokkosView(zz, &zv)); 497 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 498 PetscCall(PetscLogGpuTimeEnd()); 499 PetscFunctionReturn(PETSC_SUCCESS); 500 } 501 502 /* z = A^H x + y */ 503 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 504 { 505 Mat_SeqAIJKokkos *aijkok; 506 const char *mode; 507 ConstPetscScalarKokkosView xv; 508 PetscScalarKokkosView zv; 509 KokkosCsrMatrix csrmat; 510 511 PetscFunctionBegin; 512 PetscCall(PetscLogGpuTimeBegin()); 513 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 514 if (zz != yy) PetscCall(VecCopy(yy, zz)); 515 PetscCall(VecGetKokkosView(xx, &xv)); 516 PetscCall(VecGetKokkosView(zz, &zv)); 517 if (A->form_explicit_transpose) { 518 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 519 mode = "N"; 520 } else { 521 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 522 csrmat = aijkok->csrmat; 523 mode = "C"; 524 } 525 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 526 PetscCall(VecRestoreKokkosView(xx, &xv)); 527 PetscCall(VecRestoreKokkosView(zz, &zv)); 528 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 529 PetscCall(PetscLogGpuTimeEnd()); 530 PetscFunctionReturn(PETSC_SUCCESS); 531 } 532 533 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 534 { 535 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 536 537 PetscFunctionBegin; 538 switch (op) { 539 case MAT_FORM_EXPLICIT_TRANSPOSE: 540 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 541 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 542 A->form_explicit_transpose = flg; 543 break; 544 default: 545 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 546 break; 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* Depending on reuse, either build a new mat, or use the existing mat */ 552 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 553 { 554 Mat_SeqAIJ *aseq; 555 556 PetscFunctionBegin; 557 PetscCall(PetscKokkosInitializeCheck()); 558 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 559 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 560 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 561 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 562 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 563 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 564 PetscCall(PetscFree(A->defaultvectype)); 565 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 566 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 567 PetscCall(MatSetOps_SeqAIJKokkos(A)); 568 aseq = static_cast<Mat_SeqAIJ *>(A->data); 569 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 570 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 571 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 572 } 573 } 574 PetscFunctionReturn(PETSC_SUCCESS); 575 } 576 577 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 578 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 579 */ 580 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 581 { 582 Mat_SeqAIJ *bseq; 583 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 584 Mat mat; 585 586 PetscFunctionBegin; 587 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 588 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 589 mat = *B; 590 if (A->assembled) { 591 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 592 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 593 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 594 /* Now copy values to B if needed */ 595 if (dupOption == MAT_COPY_VALUES) { 596 if (akok->a_dual.need_sync_device()) { 597 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 598 bkok->a_dual.modify_host(); 599 } else { /* If device has the latest data, we only copy data on device */ 600 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 601 bkok->a_dual.modify_device(); 602 } 603 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 604 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 605 bkok->a_dual.modify_host(); 606 } 607 mat->spptr = bkok; 608 } 609 610 PetscCall(PetscFree(mat->defaultvectype)); 611 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 612 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 613 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 614 PetscFunctionReturn(PETSC_SUCCESS); 615 } 616 617 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 618 { 619 Mat At; 620 KokkosCsrMatrix internT; 621 Mat_SeqAIJKokkos *atkok, *bkok; 622 623 PetscFunctionBegin; 624 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 625 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 626 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 627 /* Deep copy internT, as we want to isolate the internal transpose */ 628 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 629 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 630 if (reuse == MAT_INITIAL_MATRIX) *B = At; 631 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 632 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 633 if ((*B)->assembled) { 634 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 635 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 636 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 637 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 638 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 639 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 640 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 641 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 642 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 643 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 644 } 645 PetscFunctionReturn(PETSC_SUCCESS); 646 } 647 648 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 649 { 650 Mat_SeqAIJKokkos *aijkok; 651 652 PetscFunctionBegin; 653 if (A->factortype == MAT_FACTOR_NONE) { 654 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 655 delete aijkok; 656 } else { 657 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 658 } 659 A->spptr = NULL; 660 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 661 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 662 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 663 #if defined(PETSC_HAVE_HYPRE) 664 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 665 #endif 666 PetscCall(MatDestroy_SeqAIJ(A)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /*MC 671 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 672 673 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 674 675 Options Database Key: 676 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 677 678 Level: beginner 679 680 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 681 M*/ 682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 683 { 684 PetscFunctionBegin; 685 PetscCall(PetscKokkosInitializeCheck()); 686 PetscCall(MatCreate_SeqAIJ(A)); 687 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 688 PetscFunctionReturn(PETSC_SUCCESS); 689 } 690 691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 693 { 694 Mat_SeqAIJ *a, *b; 695 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 696 MatScalarKokkosView aa, ba, ca; 697 MatRowMapKokkosView ai, bi, ci; 698 MatColIdxKokkosView aj, bj, cj; 699 PetscInt m, n, nnz, aN; 700 701 PetscFunctionBegin; 702 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 703 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 704 PetscAssertPointer(C, 4); 705 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 706 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 707 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 708 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 709 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 a = static_cast<Mat_SeqAIJ *>(A->data); 713 b = static_cast<Mat_SeqAIJ *>(B->data); 714 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 715 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 716 aa = akok->a_dual.view_device(); 717 ai = akok->i_dual.view_device(); 718 ba = bkok->a_dual.view_device(); 719 bi = bkok->i_dual.view_device(); 720 m = A->rmap->n; /* M, N and nnz of C */ 721 n = A->cmap->n + B->cmap->n; 722 nnz = a->nz + b->nz; 723 aN = A->cmap->n; /* N of A */ 724 if (reuse == MAT_INITIAL_MATRIX) { 725 aj = akok->j_dual.view_device(); 726 bj = bkok->j_dual.view_device(); 727 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 728 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 729 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 730 ca = ca_dual.view_device(); 731 ci = ci_dual.view_device(); 732 cj = cj_dual.view_device(); 733 734 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 735 Kokkos::parallel_for( 736 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 737 PetscInt i = t.league_rank(); /* row i */ 738 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 739 740 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 741 ci(i) = coffset; 742 if (i == m - 1) ci(m) = ai(m) + bi(m); 743 }); 744 745 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746 if (k < alen) { 747 ca(coffset + k) = aa(ai(i) + k); 748 cj(coffset + k) = aj(ai(i) + k); 749 } else { 750 ca(coffset + k) = ba(bi(i) + k - alen); 751 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 752 } 753 }); 754 }); 755 ca_dual.modify_device(); 756 ci_dual.modify_device(); 757 cj_dual.modify_device(); 758 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 759 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 760 } else if (reuse == MAT_REUSE_MATRIX) { 761 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 762 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 763 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 764 ca = ckok->a_dual.view_device(); 765 ci = ckok->i_dual.view_device(); 766 767 Kokkos::parallel_for( 768 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 769 PetscInt i = t.league_rank(); /* row i */ 770 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 771 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 772 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 773 else ca(ci(i) + k) = ba(bi(i) + k - alen); 774 }); 775 }); 776 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 777 } 778 PetscFunctionReturn(PETSC_SUCCESS); 779 } 780 781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 782 { 783 PetscFunctionBegin; 784 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 789 { 790 Mat_Product *product = C->product; 791 Mat A, B; 792 bool transA, transB; /* use bool, since KK needs this type */ 793 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 794 Mat_SeqAIJ *c; 795 MatProductData_SeqAIJKokkos *pdata; 796 KokkosCsrMatrix csrmatA, csrmatB; 797 798 PetscFunctionBegin; 799 MatCheckProduct(C, 1); 800 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 801 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 802 803 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 804 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 805 // we still do numeric. 806 if (pdata->reusesym) { // numeric reuses results from symbolic 807 pdata->reusesym = PETSC_FALSE; 808 PetscFunctionReturn(PETSC_SUCCESS); 809 } 810 811 switch (product->type) { 812 case MATPRODUCT_AB: 813 transA = false; 814 transB = false; 815 break; 816 case MATPRODUCT_AtB: 817 transA = true; 818 transB = false; 819 break; 820 case MATPRODUCT_ABt: 821 transA = false; 822 transB = true; 823 break; 824 default: 825 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 826 } 827 828 A = product->A; 829 B = product->B; 830 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 831 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 832 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 833 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 834 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 835 836 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 837 838 csrmatA = akok->csrmat; 839 csrmatB = bkok->csrmat; 840 841 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 842 if (transA) { 843 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 844 transA = false; 845 } 846 847 if (transB) { 848 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 849 transB = false; 850 } 851 PetscCall(PetscLogGpuTimeBegin()); 852 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 854 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 855 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 856 #endif 857 858 PetscCall(PetscLogGpuTimeEnd()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 860 /* shorter version of MatAssemblyEnd_SeqAIJ */ 861 c = (Mat_SeqAIJ *)C->data; 862 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 863 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 864 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 865 c->reallocs = 0; 866 C->info.mallocs = 0; 867 C->info.nz_unneeded = 0; 868 C->assembled = C->was_assembled = PETSC_TRUE; 869 C->num_ass++; 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 874 { 875 Mat_Product *product = C->product; 876 MatProductType ptype; 877 Mat A, B; 878 bool transA, transB; 879 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 880 MatProductData_SeqAIJKokkos *pdata; 881 MPI_Comm comm; 882 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 883 884 PetscFunctionBegin; 885 MatCheckProduct(C, 1); 886 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 887 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 888 A = product->A; 889 B = product->B; 890 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 891 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 892 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 893 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 894 csrmatA = akok->csrmat; 895 csrmatB = bkok->csrmat; 896 897 ptype = product->type; 898 // Take advantage of the symmetry if true 899 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 900 ptype = MATPRODUCT_AB; 901 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 902 } 903 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 904 ptype = MATPRODUCT_AB; 905 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 906 } 907 908 switch (ptype) { 909 case MATPRODUCT_AB: 910 transA = false; 911 transB = false; 912 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 913 break; 914 case MATPRODUCT_AtB: 915 transA = true; 916 transB = false; 917 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 918 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 919 break; 920 case MATPRODUCT_ABt: 921 transA = false; 922 transB = true; 923 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 924 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 925 break; 926 default: 927 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 928 } 929 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 930 pdata->reusesym = product->api_user; 931 932 /* TODO: add command line options to select spgemm algorithms */ 933 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 934 935 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 936 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 937 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 938 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 939 #endif 940 #endif 941 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 942 943 PetscCall(PetscLogGpuTimeBegin()); 944 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 945 if (transA) { 946 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 947 transA = false; 948 } 949 950 if (transB) { 951 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 952 transB = false; 953 } 954 955 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 956 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 957 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 958 calling new Mat_SeqAIJKokkos(). 959 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 960 */ 961 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 962 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 963 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 964 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 965 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 966 #endif 967 PetscCall(PetscLogGpuTimeEnd()); 968 969 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 970 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 971 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 /* handles sparse matrix matrix ops */ 976 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 977 { 978 Mat_Product *product = mat->product; 979 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 980 981 PetscFunctionBegin; 982 MatCheckProduct(mat, 1); 983 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 984 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 985 if (Biskok && Ciskok) { 986 switch (product->type) { 987 case MATPRODUCT_AB: 988 case MATPRODUCT_AtB: 989 case MATPRODUCT_ABt: 990 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 991 break; 992 case MATPRODUCT_PtAP: 993 case MATPRODUCT_RARt: 994 case MATPRODUCT_ABC: 995 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 996 break; 997 default: 998 break; 999 } 1000 } else { /* fallback for AIJ */ 1001 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1002 } 1003 PetscFunctionReturn(PETSC_SUCCESS); 1004 } 1005 1006 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1007 { 1008 Mat_SeqAIJKokkos *aijkok; 1009 1010 PetscFunctionBegin; 1011 PetscCall(PetscLogGpuTimeBegin()); 1012 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1015 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1016 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1017 PetscCall(PetscLogGpuTimeEnd()); 1018 PetscFunctionReturn(PETSC_SUCCESS); 1019 } 1020 1021 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1022 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1023 { 1024 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1025 1026 PetscFunctionBegin; 1027 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1028 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1029 1030 PetscCall(PetscLogGpuTimeBegin()); 1031 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1032 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1033 const auto &Aa = aijkok->a_dual.view_device(); 1034 const auto &Adiag = aijkok->diag_dual.view_device(); 1035 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1036 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1037 PetscCall(PetscLogGpuFlops(n)); 1038 PetscCall(PetscLogGpuTimeEnd()); 1039 } else { // need reassembly, very slow! 1040 PetscCall(MatShift_Basic(A, a)); 1041 } 1042 PetscFunctionReturn(PETSC_SUCCESS); 1043 } 1044 1045 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1046 { 1047 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1048 1049 PetscFunctionBegin; 1050 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1051 ConstPetscScalarKokkosView dv; 1052 PetscInt n, nv; 1053 1054 PetscCall(PetscLogGpuTimeBegin()); 1055 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1056 PetscCall(VecGetKokkosView(D, &dv)); 1057 PetscCall(VecGetLocalSize(D, &nv)); 1058 n = PetscMin(Y->rmap->n, Y->cmap->n); 1059 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1060 1061 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1062 const auto &Aa = aijkok->a_dual.view_device(); 1063 const auto &Adiag = aijkok->diag_dual.view_device(); 1064 PetscCallCXX(Kokkos::parallel_for( 1065 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1066 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1067 else Aa(Adiag(i)) += dv(i); 1068 })); 1069 PetscCall(VecRestoreKokkosView(D, &dv)); 1070 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1071 PetscCall(PetscLogGpuFlops(n)); 1072 PetscCall(PetscLogGpuTimeEnd()); 1073 } else { // need reassembly, very slow! 1074 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1075 } 1076 PetscFunctionReturn(PETSC_SUCCESS); 1077 } 1078 1079 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1080 { 1081 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1082 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1083 ConstPetscScalarKokkosView lv, rv; 1084 1085 PetscFunctionBegin; 1086 PetscCall(PetscLogGpuTimeBegin()); 1087 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1088 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1089 const auto &Aa = aijkok->a_dual.view_device(); 1090 const auto &Ai = aijkok->i_dual.view_device(); 1091 const auto &Aj = aijkok->j_dual.view_device(); 1092 if (ll) { 1093 PetscCall(VecGetLocalSize(ll, &m)); 1094 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1095 PetscCall(VecGetKokkosView(ll, &lv)); 1096 PetscCallCXX(Kokkos::parallel_for( // for each row 1097 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1098 PetscInt i = t.league_rank(); // row i 1099 PetscInt len = Ai(i + 1) - Ai(i); 1100 // scale entries on the row 1101 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1102 })); 1103 PetscCall(VecRestoreKokkosView(ll, &lv)); 1104 PetscCall(PetscLogGpuFlops(nz)); 1105 } 1106 if (rr) { 1107 PetscCall(VecGetLocalSize(rr, &n)); 1108 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1109 PetscCall(VecGetKokkosView(rr, &rv)); 1110 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1111 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1112 PetscCall(VecRestoreKokkosView(rr, &lv)); 1113 PetscCall(PetscLogGpuFlops(nz)); 1114 } 1115 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1116 PetscCall(PetscLogGpuTimeEnd()); 1117 PetscFunctionReturn(PETSC_SUCCESS); 1118 } 1119 1120 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1121 { 1122 Mat_SeqAIJKokkos *aijkok; 1123 1124 PetscFunctionBegin; 1125 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1126 if (aijkok) { /* Only zero the device if data is already there */ 1127 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1128 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1129 } else { /* Might be preallocated but not assembled */ 1130 PetscCall(MatZeroEntries_SeqAIJ(A)); 1131 } 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1136 { 1137 Mat_SeqAIJKokkos *aijkok; 1138 PetscInt n; 1139 PetscScalarKokkosView xv; 1140 1141 PetscFunctionBegin; 1142 PetscCall(VecGetLocalSize(x, &n)); 1143 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1144 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1145 1146 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1147 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1148 1149 const auto &Aa = aijkok->a_dual.view_device(); 1150 const auto &Ai = aijkok->i_dual.view_device(); 1151 const auto &Adiag = aijkok->diag_dual.view_device(); 1152 1153 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1154 Kokkos::parallel_for( 1155 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1156 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1157 else xv(i) = 0; 1158 }); 1159 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1160 PetscFunctionReturn(PETSC_SUCCESS); 1161 } 1162 1163 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1164 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1165 { 1166 Mat_SeqAIJKokkos *aijkok; 1167 1168 PetscFunctionBegin; 1169 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1170 PetscAssertPointer(kv, 2); 1171 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1172 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1173 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1174 *kv = aijkok->a_dual.view_device(); 1175 PetscFunctionReturn(PETSC_SUCCESS); 1176 } 1177 1178 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1179 { 1180 PetscFunctionBegin; 1181 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1182 PetscAssertPointer(kv, 2); 1183 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1184 PetscFunctionReturn(PETSC_SUCCESS); 1185 } 1186 1187 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1188 { 1189 Mat_SeqAIJKokkos *aijkok; 1190 1191 PetscFunctionBegin; 1192 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1193 PetscAssertPointer(kv, 2); 1194 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1195 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1196 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1197 *kv = aijkok->a_dual.view_device(); 1198 PetscFunctionReturn(PETSC_SUCCESS); 1199 } 1200 1201 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1202 { 1203 PetscFunctionBegin; 1204 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1205 PetscAssertPointer(kv, 2); 1206 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1207 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 1211 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1212 { 1213 Mat_SeqAIJKokkos *aijkok; 1214 1215 PetscFunctionBegin; 1216 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1217 PetscAssertPointer(kv, 2); 1218 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1219 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1220 *kv = aijkok->a_dual.view_device(); 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1225 { 1226 PetscFunctionBegin; 1227 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1228 PetscAssertPointer(kv, 2); 1229 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1230 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1231 PetscFunctionReturn(PETSC_SUCCESS); 1232 } 1233 1234 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1235 { 1236 Mat_SeqAIJKokkos *akok; 1237 1238 PetscFunctionBegin; 1239 auto exec = PetscGetKokkosExecutionSpace(); 1240 // Create host copies of the input aij 1241 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1242 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1243 // Don't copy the vals to the host now 1244 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1245 1246 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1247 // Note we have modified device data so it will copy lazily 1248 a_dual.modify_device(); 1249 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1250 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1251 1252 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1253 PetscCall(MatCreate(comm, A)); 1254 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1255 PetscFunctionReturn(PETSC_SUCCESS); 1256 } 1257 1258 /* Computes Y += alpha X */ 1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1260 { 1261 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1262 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1263 ConstMatScalarKokkosView Xa; 1264 MatScalarKokkosView Ya; 1265 auto exec = PetscGetKokkosExecutionSpace(); 1266 1267 PetscFunctionBegin; 1268 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1269 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1270 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1271 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1272 PetscCall(PetscLogGpuTimeBegin()); 1273 1274 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1275 PetscBool e; 1276 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1277 if (e) { 1278 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1279 if (e) pattern = SAME_NONZERO_PATTERN; 1280 } 1281 } 1282 1283 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1284 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1285 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1286 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1287 */ 1288 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1289 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1290 Xa = xkok->a_dual.view_device(); 1291 Ya = ykok->a_dual.view_device(); 1292 1293 if (pattern == SAME_NONZERO_PATTERN) { 1294 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1295 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1296 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1297 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1298 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1299 1300 Kokkos::parallel_for( 1301 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1302 PetscInt i = t.league_rank(); // row i 1303 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1304 // Only one thread works in a team 1305 PetscInt p, q = Yi(i); 1306 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1307 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1308 if (Xj(p) == Yj(q)) { // Found it 1309 Ya(q) += alpha * Xa(p); 1310 q++; 1311 } else { 1312 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1313 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1314 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1316 #else 1317 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1318 #endif 1319 } 1320 } 1321 }); 1322 }); 1323 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1324 } else { // different nonzero patterns 1325 Mat Z; 1326 KokkosCsrMatrix zcsr; 1327 KernelHandle kh; 1328 kh.create_spadd_handle(true); // X, Y are sorted 1329 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1330 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1331 zkok = new Mat_SeqAIJKokkos(zcsr); 1332 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1333 PetscCall(MatHeaderReplace(Y, &Z)); 1334 kh.destroy_spadd_handle(); 1335 } 1336 PetscCall(PetscLogGpuTimeEnd()); 1337 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1338 PetscFunctionReturn(PETSC_SUCCESS); 1339 } 1340 1341 struct MatCOOStruct_SeqAIJKokkos { 1342 PetscCount n; 1343 PetscCount Atot; 1344 PetscInt nz; 1345 PetscCountKokkosView jmap; 1346 PetscCountKokkosView perm; 1347 1348 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1349 { 1350 nz = coo_h->nz; 1351 n = coo_h->n; 1352 Atot = coo_h->Atot; 1353 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1354 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1355 } 1356 }; 1357 1358 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1359 { 1360 PetscFunctionBegin; 1361 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1362 PetscFunctionReturn(PETSC_SUCCESS); 1363 } 1364 1365 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1366 { 1367 Mat_SeqAIJKokkos *akok; 1368 Mat_SeqAIJ *aseq; 1369 PetscContainer container_h; 1370 MatCOOStruct_SeqAIJ *coo_h; 1371 MatCOOStruct_SeqAIJKokkos *coo_d; 1372 1373 PetscFunctionBegin; 1374 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1375 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1376 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1377 delete akok; 1378 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1379 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1380 1381 // Copy the COO struct to device 1382 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1383 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1384 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1385 1386 // Put the COO struct in a container and then attach that to the matrix 1387 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1388 PetscFunctionReturn(PETSC_SUCCESS); 1389 } 1390 1391 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1392 { 1393 MatScalarKokkosView Aa; 1394 ConstMatScalarKokkosView kv; 1395 PetscMemType memtype; 1396 PetscContainer container; 1397 MatCOOStruct_SeqAIJKokkos *coo; 1398 1399 PetscFunctionBegin; 1400 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1401 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1402 1403 const auto &n = coo->n; 1404 const auto &Annz = coo->nz; 1405 const auto &jmap = coo->jmap; 1406 const auto &perm = coo->perm; 1407 1408 PetscCall(PetscGetMemType(v, &memtype)); 1409 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1410 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1411 } else { 1412 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1413 } 1414 1415 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1416 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1417 1418 PetscCall(PetscLogGpuTimeBegin()); 1419 Kokkos::parallel_for( 1420 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1421 PetscScalar sum = 0.0; 1422 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1423 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1424 }); 1425 PetscCall(PetscLogGpuTimeEnd()); 1426 1427 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1428 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1433 { 1434 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1435 1436 PetscFunctionBegin; 1437 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1438 A->boundtocpu = PETSC_FALSE; 1439 1440 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1441 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1442 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1443 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1444 A->ops->scale = MatScale_SeqAIJKokkos; 1445 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1446 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1447 A->ops->mult = MatMult_SeqAIJKokkos; 1448 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1449 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1450 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1451 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1452 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1453 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1454 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1455 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1456 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1457 A->ops->shift = MatShift_SeqAIJKokkos; 1458 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1459 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1460 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1461 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1462 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1463 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1464 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1465 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1466 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1467 1468 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1470 #if defined(PETSC_HAVE_HYPRE) 1471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1472 #endif 1473 PetscFunctionReturn(PETSC_SUCCESS); 1474 } 1475 1476 /* 1477 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1478 1479 Input Parameters: 1480 + A - the MATSEQAIJKOKKOS matrix 1481 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1482 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1483 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1484 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1485 1486 Output Parameter: 1487 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1488 */ 1489 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1490 { 1491 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1492 PetscInt nblocks = bs.extent(0) - 1; 1493 1494 PetscFunctionBegin; 1495 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1496 1497 // Pull out the diagonal blocks of the matrix and then invert the blocks 1498 auto Aa = akok->a_dual.view_device(); 1499 auto Ai = akok->i_dual.view_device(); 1500 auto Aj = akok->j_dual.view_device(); 1501 auto Adiag = akok->diag_dual.view_device(); 1502 // TODO: how to tune the team size? 1503 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1504 auto ts = Kokkos::AUTO(); 1505 #else 1506 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1507 #endif 1508 PetscCallCXX(Kokkos::parallel_for( 1509 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1510 const PetscInt bid = teamMember.league_rank(); // block id 1511 const PetscInt rstart = bs(bid); // this block starts from this row 1512 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1513 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1514 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1515 1516 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1517 PetscInt i = rstart + r; // i-th row in A 1518 1519 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1520 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1521 1522 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1523 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1524 B(r, c) = 0.0; 1525 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1526 B(r, c) = Aa(first + c); 1527 } else { // this entry does not show up in the CSR 1528 B(r, c) = 0.0; 1529 } 1530 } 1531 } else { // rare case that the diagonal does not exist 1532 const PetscInt begin = Ai(i); 1533 const PetscInt end = Ai(i + 1); 1534 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1535 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1536 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1537 else if (Aj(j) >= rstart + m) break; 1538 } 1539 } 1540 }); 1541 1542 // LU-decompose B (w/o pivoting) and then invert B 1543 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1544 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1545 })); 1546 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1547 PetscFunctionReturn(PETSC_SUCCESS); 1548 } 1549 1550 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1551 { 1552 Mat_SeqAIJ *aseq; 1553 PetscInt i, m, n; 1554 auto exec = PetscGetKokkosExecutionSpace(); 1555 1556 PetscFunctionBegin; 1557 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1558 1559 m = akok->nrows(); 1560 n = akok->ncols(); 1561 PetscCall(MatSetSizes(A, m, n, m, n)); 1562 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1563 1564 /* Set up data structures of A as a MATSEQAIJ */ 1565 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1566 aseq = (Mat_SeqAIJ *)A->data; 1567 1568 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1569 PetscCallCXX(akok->j_dual.sync_host(exec)); 1570 PetscCallCXX(exec.fence()); 1571 1572 aseq->i = akok->i_host_data(); 1573 aseq->j = akok->j_host_data(); 1574 aseq->a = akok->a_host_data(); 1575 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1576 aseq->free_a = PETSC_FALSE; 1577 aseq->free_ij = PETSC_FALSE; 1578 aseq->nz = akok->nnz(); 1579 aseq->maxnz = aseq->nz; 1580 1581 PetscCall(PetscMalloc1(m, &aseq->imax)); 1582 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1583 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1584 1585 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1586 akok->nonzerostate = A->nonzerostate; 1587 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1588 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1589 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1590 PetscFunctionReturn(PETSC_SUCCESS); 1591 } 1592 1593 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1594 { 1595 PetscFunctionBegin; 1596 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1597 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1598 PetscFunctionReturn(PETSC_SUCCESS); 1599 } 1600 1601 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1602 { 1603 Mat_SeqAIJKokkos *akok; 1604 1605 PetscFunctionBegin; 1606 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1607 PetscCall(MatCreate(comm, A)); 1608 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1609 PetscFunctionReturn(PETSC_SUCCESS); 1610 } 1611 1612 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1613 1614 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1615 */ 1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1617 { 1618 PetscFunctionBegin; 1619 PetscCall(MatCreate(comm, A)); 1620 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 /*@C 1625 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1626 (the default parallel PETSc format). This matrix will ultimately be handled by 1627 Kokkos for calculations. 1628 1629 Collective 1630 1631 Input Parameters: 1632 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1633 . m - number of rows 1634 . n - number of columns 1635 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1636 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1637 1638 Output Parameter: 1639 . A - the matrix 1640 1641 Level: intermediate 1642 1643 Notes: 1644 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1645 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1646 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1647 1648 The AIJ format, also called 1649 compressed row storage, is fully compatible with standard Fortran 1650 storage. That is, the stored row and column indices can begin at 1651 either one (as in Fortran) or zero. 1652 1653 Specify the preallocated storage with either `nz` or `nnz` (not both). 1654 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1655 allocation. 1656 1657 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1658 @*/ 1659 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1660 { 1661 PetscFunctionBegin; 1662 PetscCall(PetscKokkosInitializeCheck()); 1663 PetscCall(MatCreate(comm, A)); 1664 PetscCall(MatSetSizes(*A, m, n, m, n)); 1665 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1666 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1667 PetscFunctionReturn(PETSC_SUCCESS); 1668 } 1669 1670 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1671 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1672 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1673 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1674 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1675 { 1676 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1677 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1678 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1679 1680 PetscFunctionBegin; 1681 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1682 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1683 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1684 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1685 } 1686 PetscFunctionReturn(PETSC_SUCCESS); 1687 } 1688 1689 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1690 { 1691 const PetscInt n = A->rmap->n; 1692 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1693 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1694 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1695 1696 PetscFunctionBegin; 1697 if (!factors->transpose_updated) { 1698 if (has_upper) { 1699 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1700 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1701 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1702 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1703 } 1704 1705 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1706 if (!factors->U) { 1707 Mat_SeqAIJ *seq; 1708 1709 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1710 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1711 1712 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1713 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1714 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1715 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1716 } else { 1717 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1718 } 1719 // Copy Ut from host to device 1720 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1721 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1722 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1723 } else { // If U was computed on device, we also compute the transpose there 1724 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1725 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1726 factors->jU_d, factors->aU_d, 1727 factors->iUt_d, factors->jUt_d, 1728 factors->aUt_d)); 1729 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1730 } 1731 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1732 } 1733 1734 // do the same for L with LU 1735 if (has_lower) { 1736 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1737 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1738 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1739 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1740 } 1741 1742 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1743 if (!factors->L) { 1744 Mat_SeqAIJ *seq; 1745 1746 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1747 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1748 1749 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1750 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1751 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1752 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1753 } else { 1754 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1755 } 1756 // Copy Lt from host to device 1757 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1758 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1759 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1760 } else { // If L was computed on device, we also compute the transpose there 1761 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1762 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1763 factors->jL_d, factors->aL_d, 1764 factors->iLt_d, factors->jLt_d, 1765 factors->aLt_d)); 1766 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1767 } 1768 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1769 } 1770 1771 factors->transpose_updated = PETSC_TRUE; 1772 } 1773 PetscFunctionReturn(PETSC_SUCCESS); 1774 } 1775 1776 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1777 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1778 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1779 { 1780 auto exec = PetscGetKokkosExecutionSpace(); 1781 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1782 PetscInt m = A->rmap->n; 1783 PetscScalarKokkosView D = factors->D_d; 1784 PetscScalarKokkosView X, Y, B; // alias 1785 ConstPetscScalarKokkosView b; 1786 PetscScalarKokkosView x; 1787 PetscIntKokkosView &rowperm = factors->rowperm; 1788 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1789 1790 PetscFunctionBegin; 1791 PetscCall(PetscLogGpuTimeBegin()); 1792 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1793 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1794 PetscCall(VecGetKokkosView(bb, &b)); 1795 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1796 1797 // Solve U^T Y = B 1798 if (identity) { // Reorder b with the row permutation 1799 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1800 Y = factors->workVector; 1801 } else { 1802 B = factors->workVector; 1803 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1804 Y = x; 1805 } 1806 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1807 1808 // Solve diag(D) Y' = Y. 1809 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1810 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1811 1812 // Solve UX = Y 1813 if (identity) { 1814 X = x; 1815 } else { 1816 X = factors->workVector; // B is not needed anymore 1817 } 1818 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1819 1820 // Reorder X with the inverse column (row) permutation 1821 if (!identity) { 1822 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1823 } 1824 1825 PetscCall(VecRestoreKokkosView(bb, &b)); 1826 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1827 PetscCall(PetscLogGpuTimeEnd()); 1828 PetscFunctionReturn(PETSC_SUCCESS); 1829 } 1830 1831 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1832 // R and C are represented by rowperm and colperm in factors. 1833 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1834 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1835 { 1836 auto exec = PetscGetKokkosExecutionSpace(); 1837 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1838 PetscInt m = A->rmap->n; 1839 PetscScalarKokkosView X, Y, B; // alias 1840 ConstPetscScalarKokkosView b; 1841 PetscScalarKokkosView x; 1842 PetscIntKokkosView &rowperm = factors->rowperm; 1843 PetscIntKokkosView &colperm = factors->colperm; 1844 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1845 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1846 1847 PetscFunctionBegin; 1848 PetscCall(PetscLogGpuTimeBegin()); 1849 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1850 PetscCall(VecGetKokkosView(bb, &b)); 1851 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1852 1853 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1854 if (row_identity) { 1855 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1856 Y = factors->workVector; 1857 } else { 1858 B = factors->workVector; 1859 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1860 Y = x; 1861 } 1862 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1863 1864 // Solve U C^- x = Y 1865 if (col_identity) { 1866 X = x; 1867 } else { 1868 X = factors->workVector; 1869 } 1870 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1871 1872 // x = C X; Reorder X with the inverse col permutation 1873 if (!col_identity) { 1874 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1875 } 1876 1877 PetscCall(VecRestoreKokkosView(bb, &b)); 1878 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1879 PetscCall(PetscLogGpuTimeEnd()); 1880 PetscFunctionReturn(PETSC_SUCCESS); 1881 } 1882 1883 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1884 // R and C are represented by rowperm and colperm in factors. 1885 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1886 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1887 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1888 { 1889 auto exec = PetscGetKokkosExecutionSpace(); 1890 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1891 PetscInt m = A->rmap->n; 1892 PetscScalarKokkosView X, Y, B; // alias 1893 ConstPetscScalarKokkosView b; 1894 PetscScalarKokkosView x; 1895 PetscIntKokkosView &rowperm = factors->rowperm; 1896 PetscIntKokkosView &colperm = factors->colperm; 1897 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1898 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1899 1900 PetscFunctionBegin; 1901 PetscCall(PetscLogGpuTimeBegin()); 1902 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1903 PetscCall(VecGetKokkosView(bb, &b)); 1904 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1905 1906 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1907 if (col_identity) { // Reorder b with the col permutation 1908 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1909 Y = factors->workVector; 1910 } else { 1911 B = factors->workVector; 1912 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1913 Y = x; 1914 } 1915 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1916 1917 // Solve L^T X = Y 1918 if (row_identity) { 1919 X = x; 1920 } else { 1921 X = factors->workVector; 1922 } 1923 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1924 1925 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1926 if (!row_identity) { 1927 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1928 } 1929 1930 PetscCall(VecRestoreKokkosView(bb, &b)); 1931 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1932 PetscCall(PetscLogGpuTimeEnd()); 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1937 { 1938 PetscFunctionBegin; 1939 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1940 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1941 1942 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1943 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1944 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1945 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1946 const MatScalar *Ba = b->a; 1947 PetscInt m = B->rmap->n, n = B->cmap->n; 1948 1949 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1950 // Allocate memory and copy the structure 1951 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1952 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1953 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1954 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1955 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1956 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1957 1958 PetscInt *Li = factors->iL_h.data(); 1959 PetscInt *Lj = factors->jL_h.data(); 1960 PetscInt *Ui = factors->iU_h.data(); 1961 PetscInt *Uj = factors->jU_h.data(); 1962 1963 Li[0] = Ui[0] = 0; 1964 for (PetscInt i = 0; i < m; i++) { 1965 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1966 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1967 1968 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1969 Lj[Li[i] + llen] = i; // diagonal entry of L 1970 1971 Uj[Ui[i]] = i; // diagonal entry of U 1972 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1973 1974 Li[i + 1] = Li[i] + llen + 1; 1975 Ui[i + 1] = Ui[i] + ulen; 1976 } 1977 1978 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1979 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1980 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1981 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1982 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1983 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1984 1985 // Copy row/col permutation to device 1986 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1987 PetscBool row_identity; 1988 PetscCall(ISIdentity(rowperm, &row_identity)); 1989 if (!row_identity) { 1990 const PetscInt *ip; 1991 1992 PetscCall(ISGetIndices(rowperm, &ip)); 1993 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1994 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1995 PetscCall(ISRestoreIndices(rowperm, &ip)); 1996 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1997 } 1998 1999 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2000 PetscBool col_identity; 2001 PetscCall(ISIdentity(colperm, &col_identity)); 2002 if (!col_identity) { 2003 const PetscInt *ip; 2004 2005 PetscCall(ISGetIndices(colperm, &ip)); 2006 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2007 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2008 PetscCall(ISRestoreIndices(colperm, &ip)); 2009 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2010 } 2011 2012 /* Create sptrsv handles for L, U and their transpose */ 2013 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2014 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2015 #else 2016 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2017 #endif 2018 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2019 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2020 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2021 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2022 } 2023 2024 // Copy the value 2025 for (PetscInt i = 0; i < m; i++) { 2026 PetscInt llen = Bi[i + 1] - Bi[i]; 2027 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2028 const PetscInt *Li = factors->iL_h.data(); 2029 const PetscInt *Ui = factors->iU_h.data(); 2030 2031 PetscScalar *La = factors->aL_h.data(); 2032 PetscScalar *Ua = factors->aU_h.data(); 2033 2034 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2035 La[Li[i] + llen] = 1.0; // diagonal entry 2036 2037 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2038 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2039 } 2040 2041 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2042 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2043 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2044 factors->transpose_updated = PETSC_FALSE; 2045 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2046 2047 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2048 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2049 } 2050 2051 B->ops->matsolve = NULL; 2052 B->ops->matsolvetranspose = NULL; 2053 PetscFunctionReturn(PETSC_SUCCESS); 2054 } 2055 2056 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2057 { 2058 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2059 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2060 PetscInt fill_lev = info->levels; 2061 2062 PetscFunctionBegin; 2063 PetscCall(PetscLogGpuTimeBegin()); 2064 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2065 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2066 2067 auto a_d = aijkok->a_dual.view_device(); 2068 auto i_d = aijkok->i_dual.view_device(); 2069 auto j_d = aijkok->j_dual.view_device(); 2070 2071 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2072 2073 B->assembled = PETSC_TRUE; 2074 B->preallocated = PETSC_TRUE; 2075 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2076 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2077 B->ops->matsolve = NULL; 2078 B->ops->matsolvetranspose = NULL; 2079 2080 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2081 factors->transpose_updated = PETSC_FALSE; 2082 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2083 /* TODO: log flops, but how to know that? */ 2084 PetscCall(PetscLogGpuTimeEnd()); 2085 PetscFunctionReturn(PETSC_SUCCESS); 2086 } 2087 2088 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2089 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2090 { 2091 Mat_SeqAIJKokkos *aijkok; 2092 Mat_SeqAIJ *b; 2093 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2094 PetscInt fill_lev = info->levels; 2095 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2096 PetscInt n = A->rmap->n; 2097 2098 PetscFunctionBegin; 2099 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2100 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2101 2102 /* Create a spiluk handle and then do symbolic factorization */ 2103 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2104 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2105 2106 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2107 2108 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2109 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2110 Kokkos::realloc(factors->iU_d, n + 1); 2111 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2112 2113 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2114 auto i_d = aijkok->i_dual.view_device(); 2115 auto j_d = aijkok->j_dual.view_device(); 2116 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2117 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2118 2119 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2120 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2121 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2122 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2123 2124 /* TODO: add options to select sptrsv algorithms */ 2125 /* Create sptrsv handles for L, U and their transpose */ 2126 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2127 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2128 #else 2129 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2130 #endif 2131 2132 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2133 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2134 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2135 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2136 2137 /* Fill fields of the factor matrix B */ 2138 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2139 b = (Mat_SeqAIJ *)B->data; 2140 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2141 B->info.fill_ratio_given = info->fill; 2142 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2143 2144 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 2148 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2149 { 2150 PetscFunctionBegin; 2151 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2152 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2153 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2154 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2155 PetscFunctionReturn(PETSC_SUCCESS); 2156 } 2157 2158 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2159 { 2160 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2161 2162 PetscFunctionBegin; 2163 if (!info->factoronhost) { 2164 PetscCall(ISIdentity(isrow, &row_identity)); 2165 PetscCall(ISIdentity(iscol, &col_identity)); 2166 } 2167 2168 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2169 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2170 2171 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2172 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2173 } else { 2174 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2175 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2176 } 2177 PetscFunctionReturn(PETSC_SUCCESS); 2178 } 2179 2180 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2181 { 2182 PetscFunctionBegin; 2183 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2184 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2185 2186 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2187 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2188 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2189 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2190 const MatScalar *Ba = b->a; 2191 PetscInt m = B->rmap->n; 2192 2193 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2194 // Allocate memory and copy the structure 2195 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2196 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2197 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2198 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2199 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2200 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2201 2202 // Build jU_h from the skewed Aj 2203 PetscInt *Uj = factors->jU_h.data(); 2204 for (PetscInt i = 0; i < m; i++) { 2205 PetscInt ulen = Bi[i + 1] - Bi[i]; 2206 Uj[Bi[i]] = i; // diagonal entry 2207 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2208 } 2209 2210 // Copy iU, jU to device 2211 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2212 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2213 2214 // Copy row/col permutation to device 2215 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2216 PetscBool row_identity; 2217 PetscCall(ISIdentity(rowperm, &row_identity)); 2218 if (!row_identity) { 2219 const PetscInt *ip; 2220 2221 PetscCall(ISGetIndices(rowperm, &ip)); 2222 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2223 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2224 PetscCall(ISRestoreIndices(rowperm, &ip)); 2225 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2226 } 2227 2228 // Create sptrsv handles for U and U^T 2229 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2230 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2231 #else 2232 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2233 #endif 2234 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2235 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2236 } 2237 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2238 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2239 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2240 2241 // Copy the value 2242 PetscScalar *Ua = factors->aU_h.data(); 2243 PetscScalar *D = factors->D_h.data(); 2244 for (PetscInt i = 0; i < m; i++) { 2245 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2246 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2247 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2248 } 2249 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2250 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2251 2252 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2253 factors->transpose_updated = PETSC_FALSE; 2254 } 2255 2256 B->ops->matsolve = NULL; 2257 B->ops->matsolvetranspose = NULL; 2258 PetscFunctionReturn(PETSC_SUCCESS); 2259 } 2260 2261 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2262 { 2263 PetscFunctionBegin; 2264 if (info->solveonhost) { 2265 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2266 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2267 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2268 } 2269 2270 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2271 2272 if (!info->solveonhost) { 2273 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2274 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2275 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2276 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2277 } 2278 PetscFunctionReturn(PETSC_SUCCESS); 2279 } 2280 2281 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2282 { 2283 PetscFunctionBegin; 2284 if (info->solveonhost) { 2285 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2286 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2287 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2288 } 2289 2290 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2291 2292 if (!info->solveonhost) { 2293 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2294 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2295 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2296 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2297 } 2298 PetscFunctionReturn(PETSC_SUCCESS); 2299 } 2300 2301 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2302 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2303 { 2304 PetscFunctionBegin; 2305 *type = MATSOLVERKOKKOS; 2306 PetscFunctionReturn(PETSC_SUCCESS); 2307 } 2308 2309 /*MC 2310 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2311 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2312 2313 Level: beginner 2314 2315 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2316 M*/ 2317 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2318 { 2319 PetscInt n = A->rmap->n; 2320 MPI_Comm comm; 2321 2322 PetscFunctionBegin; 2323 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2324 PetscCall(MatCreate(comm, B)); 2325 PetscCall(MatSetSizes(*B, n, n, n, n)); 2326 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2327 (*B)->factortype = ftype; 2328 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2329 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2330 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2331 2332 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2333 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2334 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2335 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2336 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2337 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2338 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2339 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2340 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2341 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2342 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2343 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2344 2345 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2346 (*B)->canuseordering = PETSC_TRUE; 2347 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2348 PetscFunctionReturn(PETSC_SUCCESS); 2349 } 2350 2351 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2352 { 2353 PetscFunctionBegin; 2354 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2355 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2356 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2357 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2358 PetscFunctionReturn(PETSC_SUCCESS); 2359 } 2360 2361 /* Utility to print out a KokkosCsrMatrix for debugging */ 2362 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2363 { 2364 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2365 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2366 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2367 const PetscInt *i = iv.data(); 2368 const PetscInt *j = jv.data(); 2369 const PetscScalar *a = av.data(); 2370 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2371 2372 PetscFunctionBegin; 2373 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2374 for (PetscInt k = 0; k < m; k++) { 2375 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2376 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2377 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2378 } 2379 PetscFunctionReturn(PETSC_SUCCESS); 2380 } 2381