1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 delete aijkok; 82 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 83 A->spptr = aijkok; 84 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 85 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 86 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 87 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 88 } 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* Sync CSR data to device if not yet */ 93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 94 { 95 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 96 97 PetscFunctionBegin; 98 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 99 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 100 if (aijkok->a_dual.need_sync_device()) { 101 aijkok->a_dual.sync_device(); 102 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 103 aijkok->hermitian_updated = PETSC_FALSE; 104 } 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 /* Mark the CSR data on device as modified */ 109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 110 { 111 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 112 113 PetscFunctionBegin; 114 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 115 aijkok->a_dual.clear_sync_state(); 116 aijkok->a_dual.modify_device(); 117 aijkok->transpose_updated = PETSC_FALSE; 118 aijkok->hermitian_updated = PETSC_FALSE; 119 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 120 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 auto exec = PetscGetKokkosExecutionSpace(); 128 129 PetscFunctionBegin; 130 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 131 /* We do not expect one needs factors on host */ 132 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 133 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 134 PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 139 { 140 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 141 142 PetscFunctionBegin; 143 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 144 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 145 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 146 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 147 */ 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Happens when calling MatSetValues on a newly created matrix */ 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 165 PetscFunctionReturn(PETSC_SUCCESS); 166 } 167 168 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 169 { 170 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 171 172 PetscFunctionBegin; 173 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 174 auto exec = PetscGetKokkosExecutionSpace(); 175 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 176 PetscCallCXX(exec.fence()); 177 *array = aijkok->a_dual.view_host().data(); 178 } else { 179 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 180 } 181 PetscFunctionReturn(PETSC_SUCCESS); 182 } 183 184 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 185 { 186 PetscFunctionBegin; 187 *array = NULL; 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 197 *array = aijkok->a_dual.view_host().data(); 198 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 199 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 200 } 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 210 aijkok->a_dual.clear_sync_state(); 211 aijkok->a_dual.modify_host(); 212 } 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 217 { 218 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 219 220 PetscFunctionBegin; 221 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 222 223 if (i) *i = aijkok->i_device_data(); 224 if (j) *j = aijkok->j_device_data(); 225 if (a) { 226 aijkok->a_dual.sync_device(); 227 *a = aijkok->a_device_data(); 228 } 229 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 /* 234 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 235 236 Input Parameter: 237 . A - the MATSEQAIJKOKKOS matrix 238 239 Output Parameters: 240 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 241 - T_d - the transpose on device, whose value array is allocated but not initialized 242 */ 243 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 244 { 245 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 246 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 247 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 248 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 249 MatRowMapType *Ti = Ti_h.data(); 250 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 251 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 252 PetscInt *Tj = Tj_h.data(); 253 PetscInt *perm = perm_h.data(); 254 PetscInt *offset; 255 256 PetscFunctionBegin; 257 // Populate Ti 258 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 259 Ti++; 260 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 261 Ti--; 262 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 263 264 // Populate Tj and the permutation array 265 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 266 for (PetscInt i = 0; i < m; i++) { 267 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 268 PetscInt r = Aj[j]; // row r of T 269 PetscInt disp = Ti[r] + offset[r]; 270 271 Tj[disp] = i; // col i of T 272 perm[disp] = j; 273 offset[r]++; 274 } 275 } 276 PetscCall(PetscFree(offset)); 277 278 // Sort each row of T, along with the permutation array 279 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 280 281 // Output perm and T on device 282 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 283 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 284 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 285 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 286 PetscFunctionReturn(PETSC_SUCCESS); 287 } 288 289 // Generate the transpose on device and cache it internally 290 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 291 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 292 { 293 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 294 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 295 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 296 KokkosCsrMatrix &T = akok->csrmatT; 297 298 PetscFunctionBegin; 299 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 300 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 301 302 const auto &Aa = akok->a_dual.view_device(); 303 304 if (A->symmetric == PETSC_BOOL3_TRUE) { 305 *csrmatT = akok->csrmat; 306 } else { 307 // See if we already have a cached transpose and its value is up to date 308 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 309 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 310 const auto &perm = akok->transpose_perm; // get the permutation array 311 auto &Ta = T.values; 312 313 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 314 } 315 } else { // Generate T of size n x m for the first time 316 MatRowMapKokkosView perm; 317 318 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 319 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 320 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 321 } 322 akok->transpose_updated = PETSC_TRUE; 323 *csrmatT = akok->csrmatT; 324 } 325 PetscFunctionReturn(PETSC_SUCCESS); 326 } 327 328 // Generate the Hermitian on device and cache it internally 329 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 330 { 331 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 332 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 333 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 334 KokkosCsrMatrix &T = akok->csrmatH; 335 336 PetscFunctionBegin; 337 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 338 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 339 340 const auto &Aa = akok->a_dual.view_device(); 341 342 if (A->hermitian == PETSC_BOOL3_TRUE) { 343 *csrmatH = akok->csrmat; 344 } else { 345 // See if we already have a cached hermitian and its value is up to date 346 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 347 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 348 const auto &perm = akok->transpose_perm; // get the permutation array 349 auto &Ta = T.values; 350 351 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 352 } 353 } else { // Generate T of size n x m for the first time 354 MatRowMapKokkosView perm; 355 356 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 357 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 358 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 359 } 360 akok->hermitian_updated = PETSC_TRUE; 361 *csrmatH = akok->csrmatH; 362 } 363 PetscFunctionReturn(PETSC_SUCCESS); 364 } 365 366 /* y = A x */ 367 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 368 { 369 Mat_SeqAIJKokkos *aijkok; 370 ConstPetscScalarKokkosView xv; 371 PetscScalarKokkosView yv; 372 373 PetscFunctionBegin; 374 PetscCall(PetscLogGpuTimeBegin()); 375 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 376 PetscCall(VecGetKokkosView(xx, &xv)); 377 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 378 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 379 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 380 PetscCall(VecRestoreKokkosView(xx, &xv)); 381 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 382 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 383 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 384 PetscCall(PetscLogGpuTimeEnd()); 385 PetscFunctionReturn(PETSC_SUCCESS); 386 } 387 388 /* y = A^T x */ 389 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 390 { 391 Mat_SeqAIJKokkos *aijkok; 392 const char *mode; 393 ConstPetscScalarKokkosView xv; 394 PetscScalarKokkosView yv; 395 KokkosCsrMatrix csrmat; 396 397 PetscFunctionBegin; 398 PetscCall(PetscLogGpuTimeBegin()); 399 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 400 PetscCall(VecGetKokkosView(xx, &xv)); 401 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 402 if (A->form_explicit_transpose) { 403 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 404 mode = "N"; 405 } else { 406 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 407 csrmat = aijkok->csrmat; 408 mode = "T"; 409 } 410 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 411 PetscCall(VecRestoreKokkosView(xx, &xv)); 412 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 413 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 414 PetscCall(PetscLogGpuTimeEnd()); 415 PetscFunctionReturn(PETSC_SUCCESS); 416 } 417 418 /* y = A^H x */ 419 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 420 { 421 Mat_SeqAIJKokkos *aijkok; 422 const char *mode; 423 ConstPetscScalarKokkosView xv; 424 PetscScalarKokkosView yv; 425 KokkosCsrMatrix csrmat; 426 427 PetscFunctionBegin; 428 PetscCall(PetscLogGpuTimeBegin()); 429 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 430 PetscCall(VecGetKokkosView(xx, &xv)); 431 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 432 if (A->form_explicit_transpose) { 433 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 434 mode = "N"; 435 } else { 436 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 437 csrmat = aijkok->csrmat; 438 mode = "C"; 439 } 440 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 441 PetscCall(VecRestoreKokkosView(xx, &xv)); 442 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 443 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 444 PetscCall(PetscLogGpuTimeEnd()); 445 PetscFunctionReturn(PETSC_SUCCESS); 446 } 447 448 /* z = A x + y */ 449 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 450 { 451 Mat_SeqAIJKokkos *aijkok; 452 ConstPetscScalarKokkosView xv; 453 PetscScalarKokkosView zv; 454 455 PetscFunctionBegin; 456 PetscCall(PetscLogGpuTimeBegin()); 457 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 458 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 459 PetscCall(VecGetKokkosView(xx, &xv)); 460 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 461 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 462 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 463 PetscCall(VecRestoreKokkosView(xx, &xv)); 464 PetscCall(VecRestoreKokkosView(zz, &zv)); 465 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 466 PetscCall(PetscLogGpuTimeEnd()); 467 PetscFunctionReturn(PETSC_SUCCESS); 468 } 469 470 /* z = A^T x + y */ 471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 472 { 473 Mat_SeqAIJKokkos *aijkok; 474 const char *mode; 475 ConstPetscScalarKokkosView xv; 476 PetscScalarKokkosView zv; 477 KokkosCsrMatrix csrmat; 478 479 PetscFunctionBegin; 480 PetscCall(PetscLogGpuTimeBegin()); 481 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 482 if (zz != yy) PetscCall(VecCopy(yy, zz)); 483 PetscCall(VecGetKokkosView(xx, &xv)); 484 PetscCall(VecGetKokkosView(zz, &zv)); 485 if (A->form_explicit_transpose) { 486 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 487 mode = "N"; 488 } else { 489 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 490 csrmat = aijkok->csrmat; 491 mode = "T"; 492 } 493 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 494 PetscCall(VecRestoreKokkosView(xx, &xv)); 495 PetscCall(VecRestoreKokkosView(zz, &zv)); 496 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 497 PetscCall(PetscLogGpuTimeEnd()); 498 PetscFunctionReturn(PETSC_SUCCESS); 499 } 500 501 /* z = A^H x + y */ 502 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 503 { 504 Mat_SeqAIJKokkos *aijkok; 505 const char *mode; 506 ConstPetscScalarKokkosView xv; 507 PetscScalarKokkosView zv; 508 KokkosCsrMatrix csrmat; 509 510 PetscFunctionBegin; 511 PetscCall(PetscLogGpuTimeBegin()); 512 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 513 if (zz != yy) PetscCall(VecCopy(yy, zz)); 514 PetscCall(VecGetKokkosView(xx, &xv)); 515 PetscCall(VecGetKokkosView(zz, &zv)); 516 if (A->form_explicit_transpose) { 517 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 518 mode = "N"; 519 } else { 520 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 521 csrmat = aijkok->csrmat; 522 mode = "C"; 523 } 524 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 525 PetscCall(VecRestoreKokkosView(xx, &xv)); 526 PetscCall(VecRestoreKokkosView(zz, &zv)); 527 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 528 PetscCall(PetscLogGpuTimeEnd()); 529 PetscFunctionReturn(PETSC_SUCCESS); 530 } 531 532 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 533 { 534 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 535 536 PetscFunctionBegin; 537 switch (op) { 538 case MAT_FORM_EXPLICIT_TRANSPOSE: 539 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 540 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 541 A->form_explicit_transpose = flg; 542 break; 543 default: 544 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 545 break; 546 } 547 PetscFunctionReturn(PETSC_SUCCESS); 548 } 549 550 /* Depending on reuse, either build a new mat, or use the existing mat */ 551 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 552 { 553 Mat_SeqAIJ *aseq; 554 555 PetscFunctionBegin; 556 PetscCall(PetscKokkosInitializeCheck()); 557 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 558 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 559 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 560 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 561 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 562 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 563 PetscCall(PetscFree(A->defaultvectype)); 564 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 565 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 566 PetscCall(MatSetOps_SeqAIJKokkos(A)); 567 aseq = static_cast<Mat_SeqAIJ *>(A->data); 568 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 569 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 570 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 571 } 572 } 573 PetscFunctionReturn(PETSC_SUCCESS); 574 } 575 576 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 577 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 578 */ 579 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 580 { 581 Mat_SeqAIJ *bseq; 582 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 583 Mat mat; 584 585 PetscFunctionBegin; 586 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 587 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 588 mat = *B; 589 if (A->assembled) { 590 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 591 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 592 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 593 /* Now copy values to B if needed */ 594 if (dupOption == MAT_COPY_VALUES) { 595 if (akok->a_dual.need_sync_device()) { 596 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 597 bkok->a_dual.modify_host(); 598 } else { /* If device has the latest data, we only copy data on device */ 599 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 600 bkok->a_dual.modify_device(); 601 } 602 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 603 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 604 bkok->a_dual.modify_host(); 605 } 606 mat->spptr = bkok; 607 } 608 609 PetscCall(PetscFree(mat->defaultvectype)); 610 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 611 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 612 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 613 PetscFunctionReturn(PETSC_SUCCESS); 614 } 615 616 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 617 { 618 Mat At; 619 KokkosCsrMatrix internT; 620 Mat_SeqAIJKokkos *atkok, *bkok; 621 622 PetscFunctionBegin; 623 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 624 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 625 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 626 /* Deep copy internT, as we want to isolate the internal transpose */ 627 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 628 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 629 if (reuse == MAT_INITIAL_MATRIX) *B = At; 630 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 631 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 632 if ((*B)->assembled) { 633 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 634 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 635 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 636 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 637 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 638 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 639 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 640 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 641 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 642 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 643 } 644 PetscFunctionReturn(PETSC_SUCCESS); 645 } 646 647 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 648 { 649 Mat_SeqAIJKokkos *aijkok; 650 651 PetscFunctionBegin; 652 if (A->factortype == MAT_FACTOR_NONE) { 653 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 654 delete aijkok; 655 } else { 656 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 657 } 658 A->spptr = NULL; 659 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 660 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 661 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 662 #if defined(PETSC_HAVE_HYPRE) 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 664 #endif 665 PetscCall(MatDestroy_SeqAIJ(A)); 666 PetscFunctionReturn(PETSC_SUCCESS); 667 } 668 669 /*MC 670 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 671 672 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 673 674 Options Database Key: 675 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 676 677 Level: beginner 678 679 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 680 M*/ 681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 682 { 683 PetscFunctionBegin; 684 PetscCall(PetscKokkosInitializeCheck()); 685 PetscCall(MatCreate_SeqAIJ(A)); 686 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 687 PetscFunctionReturn(PETSC_SUCCESS); 688 } 689 690 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 691 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 692 { 693 Mat_SeqAIJ *a, *b; 694 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 695 MatScalarKokkosView aa, ba, ca; 696 MatRowMapKokkosView ai, bi, ci; 697 MatColIdxKokkosView aj, bj, cj; 698 PetscInt m, n, nnz, aN; 699 700 PetscFunctionBegin; 701 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 702 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 703 PetscAssertPointer(C, 4); 704 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 705 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 706 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 707 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 708 709 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 710 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 711 a = static_cast<Mat_SeqAIJ *>(A->data); 712 b = static_cast<Mat_SeqAIJ *>(B->data); 713 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 714 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 715 aa = akok->a_dual.view_device(); 716 ai = akok->i_dual.view_device(); 717 ba = bkok->a_dual.view_device(); 718 bi = bkok->i_dual.view_device(); 719 m = A->rmap->n; /* M, N and nnz of C */ 720 n = A->cmap->n + B->cmap->n; 721 nnz = a->nz + b->nz; 722 aN = A->cmap->n; /* N of A */ 723 if (reuse == MAT_INITIAL_MATRIX) { 724 aj = akok->j_dual.view_device(); 725 bj = bkok->j_dual.view_device(); 726 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 727 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 728 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 729 ca = ca_dual.view_device(); 730 ci = ci_dual.view_device(); 731 cj = cj_dual.view_device(); 732 733 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 734 Kokkos::parallel_for( 735 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 736 PetscInt i = t.league_rank(); /* row i */ 737 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 738 739 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 740 ci(i) = coffset; 741 if (i == m - 1) ci(m) = ai(m) + bi(m); 742 }); 743 744 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 745 if (k < alen) { 746 ca(coffset + k) = aa(ai(i) + k); 747 cj(coffset + k) = aj(ai(i) + k); 748 } else { 749 ca(coffset + k) = ba(bi(i) + k - alen); 750 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 751 } 752 }); 753 }); 754 ca_dual.modify_device(); 755 ci_dual.modify_device(); 756 cj_dual.modify_device(); 757 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 758 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 759 } else if (reuse == MAT_REUSE_MATRIX) { 760 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 761 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 762 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 763 ca = ckok->a_dual.view_device(); 764 ci = ckok->i_dual.view_device(); 765 766 Kokkos::parallel_for( 767 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 768 PetscInt i = t.league_rank(); /* row i */ 769 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 770 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 771 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 772 else ca(ci(i) + k) = ba(bi(i) + k - alen); 773 }); 774 }); 775 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 776 } 777 PetscFunctionReturn(PETSC_SUCCESS); 778 } 779 780 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 781 { 782 PetscFunctionBegin; 783 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 784 PetscFunctionReturn(PETSC_SUCCESS); 785 } 786 787 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 788 { 789 Mat_Product *product = C->product; 790 Mat A, B; 791 bool transA, transB; /* use bool, since KK needs this type */ 792 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 793 Mat_SeqAIJ *c; 794 MatProductData_SeqAIJKokkos *pdata; 795 KokkosCsrMatrix csrmatA, csrmatB; 796 797 PetscFunctionBegin; 798 MatCheckProduct(C, 1); 799 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 800 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 801 802 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 803 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 804 // we still do numeric. 805 if (pdata->reusesym) { // numeric reuses results from symbolic 806 pdata->reusesym = PETSC_FALSE; 807 PetscFunctionReturn(PETSC_SUCCESS); 808 } 809 810 switch (product->type) { 811 case MATPRODUCT_AB: 812 transA = false; 813 transB = false; 814 break; 815 case MATPRODUCT_AtB: 816 transA = true; 817 transB = false; 818 break; 819 case MATPRODUCT_ABt: 820 transA = false; 821 transB = true; 822 break; 823 default: 824 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 825 } 826 827 A = product->A; 828 B = product->B; 829 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 830 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 831 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 832 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 833 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 834 835 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 836 837 csrmatA = akok->csrmat; 838 csrmatB = bkok->csrmat; 839 840 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 841 if (transA) { 842 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 843 transA = false; 844 } 845 846 if (transB) { 847 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 848 transB = false; 849 } 850 PetscCall(PetscLogGpuTimeBegin()); 851 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 852 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 853 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 854 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 855 #endif 856 857 PetscCall(PetscLogGpuTimeEnd()); 858 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 859 /* shorter version of MatAssemblyEnd_SeqAIJ */ 860 c = (Mat_SeqAIJ *)C->data; 861 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 862 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 863 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 864 c->reallocs = 0; 865 C->info.mallocs = 0; 866 C->info.nz_unneeded = 0; 867 C->assembled = C->was_assembled = PETSC_TRUE; 868 C->num_ass++; 869 PetscFunctionReturn(PETSC_SUCCESS); 870 } 871 872 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 873 { 874 Mat_Product *product = C->product; 875 MatProductType ptype; 876 Mat A, B; 877 bool transA, transB; 878 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 879 MatProductData_SeqAIJKokkos *pdata; 880 MPI_Comm comm; 881 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 882 883 PetscFunctionBegin; 884 MatCheckProduct(C, 1); 885 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 886 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 887 A = product->A; 888 B = product->B; 889 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 890 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 891 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 892 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 893 csrmatA = akok->csrmat; 894 csrmatB = bkok->csrmat; 895 896 ptype = product->type; 897 // Take advantage of the symmetry if true 898 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 899 ptype = MATPRODUCT_AB; 900 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 901 } 902 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 903 ptype = MATPRODUCT_AB; 904 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 905 } 906 907 switch (ptype) { 908 case MATPRODUCT_AB: 909 transA = false; 910 transB = false; 911 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 912 break; 913 case MATPRODUCT_AtB: 914 transA = true; 915 transB = false; 916 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 917 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 918 break; 919 case MATPRODUCT_ABt: 920 transA = false; 921 transB = true; 922 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 923 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 924 break; 925 default: 926 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 927 } 928 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 929 pdata->reusesym = product->api_user; 930 931 /* TODO: add command line options to select spgemm algorithms */ 932 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 933 934 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 935 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 936 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 937 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 938 #endif 939 #endif 940 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 941 942 PetscCall(PetscLogGpuTimeBegin()); 943 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 944 if (transA) { 945 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 946 transA = false; 947 } 948 949 if (transB) { 950 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 951 transB = false; 952 } 953 954 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 955 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 956 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 957 calling new Mat_SeqAIJKokkos(). 958 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 959 */ 960 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 961 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 962 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 963 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 964 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 965 #endif 966 PetscCall(PetscLogGpuTimeEnd()); 967 968 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 969 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 970 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 971 PetscFunctionReturn(PETSC_SUCCESS); 972 } 973 974 /* handles sparse matrix matrix ops */ 975 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 976 { 977 Mat_Product *product = mat->product; 978 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 979 980 PetscFunctionBegin; 981 MatCheckProduct(mat, 1); 982 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 983 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 984 if (Biskok && Ciskok) { 985 switch (product->type) { 986 case MATPRODUCT_AB: 987 case MATPRODUCT_AtB: 988 case MATPRODUCT_ABt: 989 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 990 break; 991 case MATPRODUCT_PtAP: 992 case MATPRODUCT_RARt: 993 case MATPRODUCT_ABC: 994 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 995 break; 996 default: 997 break; 998 } 999 } else { /* fallback for AIJ */ 1000 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1001 } 1002 PetscFunctionReturn(PETSC_SUCCESS); 1003 } 1004 1005 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1006 { 1007 Mat_SeqAIJKokkos *aijkok; 1008 1009 PetscFunctionBegin; 1010 PetscCall(PetscLogGpuTimeBegin()); 1011 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1012 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1013 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1014 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1015 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1016 PetscCall(PetscLogGpuTimeEnd()); 1017 PetscFunctionReturn(PETSC_SUCCESS); 1018 } 1019 1020 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1021 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1022 { 1023 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1024 1025 PetscFunctionBegin; 1026 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1027 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1028 1029 PetscCall(PetscLogGpuTimeBegin()); 1030 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1031 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1032 const auto &Aa = aijkok->a_dual.view_device(); 1033 const auto &Adiag = aijkok->diag_dual.view_device(); 1034 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1035 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1036 PetscCall(PetscLogGpuFlops(n)); 1037 PetscCall(PetscLogGpuTimeEnd()); 1038 } else { // need reassembly, very slow! 1039 PetscCall(MatShift_Basic(A, a)); 1040 } 1041 PetscFunctionReturn(PETSC_SUCCESS); 1042 } 1043 1044 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1045 { 1046 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1047 1048 PetscFunctionBegin; 1049 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1050 ConstPetscScalarKokkosView dv; 1051 PetscInt n, nv; 1052 1053 PetscCall(PetscLogGpuTimeBegin()); 1054 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1055 PetscCall(VecGetKokkosView(D, &dv)); 1056 PetscCall(VecGetLocalSize(D, &nv)); 1057 n = PetscMin(Y->rmap->n, Y->cmap->n); 1058 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1059 1060 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1061 const auto &Aa = aijkok->a_dual.view_device(); 1062 const auto &Adiag = aijkok->diag_dual.view_device(); 1063 PetscCallCXX(Kokkos::parallel_for( 1064 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1065 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1066 else Aa(Adiag(i)) += dv(i); 1067 })); 1068 PetscCall(VecRestoreKokkosView(D, &dv)); 1069 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1070 PetscCall(PetscLogGpuFlops(n)); 1071 PetscCall(PetscLogGpuTimeEnd()); 1072 } else { // need reassembly, very slow! 1073 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1074 } 1075 PetscFunctionReturn(PETSC_SUCCESS); 1076 } 1077 1078 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1079 { 1080 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1081 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1082 ConstPetscScalarKokkosView lv, rv; 1083 1084 PetscFunctionBegin; 1085 PetscCall(PetscLogGpuTimeBegin()); 1086 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1087 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1088 const auto &Aa = aijkok->a_dual.view_device(); 1089 const auto &Ai = aijkok->i_dual.view_device(); 1090 const auto &Aj = aijkok->j_dual.view_device(); 1091 if (ll) { 1092 PetscCall(VecGetLocalSize(ll, &m)); 1093 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1094 PetscCall(VecGetKokkosView(ll, &lv)); 1095 PetscCallCXX(Kokkos::parallel_for( // for each row 1096 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1097 PetscInt i = t.league_rank(); // row i 1098 PetscInt len = Ai(i + 1) - Ai(i); 1099 // scale entries on the row 1100 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1101 })); 1102 PetscCall(VecRestoreKokkosView(ll, &lv)); 1103 PetscCall(PetscLogGpuFlops(nz)); 1104 } 1105 if (rr) { 1106 PetscCall(VecGetLocalSize(rr, &n)); 1107 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1108 PetscCall(VecGetKokkosView(rr, &rv)); 1109 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1110 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1111 PetscCall(VecRestoreKokkosView(rr, &lv)); 1112 PetscCall(PetscLogGpuFlops(nz)); 1113 } 1114 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1115 PetscCall(PetscLogGpuTimeEnd()); 1116 PetscFunctionReturn(PETSC_SUCCESS); 1117 } 1118 1119 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1120 { 1121 Mat_SeqAIJKokkos *aijkok; 1122 1123 PetscFunctionBegin; 1124 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1125 if (aijkok) { /* Only zero the device if data is already there */ 1126 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1127 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1128 } else { /* Might be preallocated but not assembled */ 1129 PetscCall(MatZeroEntries_SeqAIJ(A)); 1130 } 1131 PetscFunctionReturn(PETSC_SUCCESS); 1132 } 1133 1134 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1135 { 1136 Mat_SeqAIJKokkos *aijkok; 1137 PetscInt n; 1138 PetscScalarKokkosView xv; 1139 1140 PetscFunctionBegin; 1141 PetscCall(VecGetLocalSize(x, &n)); 1142 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1143 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1144 1145 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1146 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1147 1148 const auto &Aa = aijkok->a_dual.view_device(); 1149 const auto &Ai = aijkok->i_dual.view_device(); 1150 const auto &Adiag = aijkok->diag_dual.view_device(); 1151 1152 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1153 Kokkos::parallel_for( 1154 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1155 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1156 else xv(i) = 0; 1157 }); 1158 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1159 PetscFunctionReturn(PETSC_SUCCESS); 1160 } 1161 1162 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1163 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1164 { 1165 Mat_SeqAIJKokkos *aijkok; 1166 1167 PetscFunctionBegin; 1168 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1169 PetscAssertPointer(kv, 2); 1170 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1171 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1172 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1173 *kv = aijkok->a_dual.view_device(); 1174 PetscFunctionReturn(PETSC_SUCCESS); 1175 } 1176 1177 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1178 { 1179 PetscFunctionBegin; 1180 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1181 PetscAssertPointer(kv, 2); 1182 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1183 PetscFunctionReturn(PETSC_SUCCESS); 1184 } 1185 1186 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1187 { 1188 Mat_SeqAIJKokkos *aijkok; 1189 1190 PetscFunctionBegin; 1191 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1192 PetscAssertPointer(kv, 2); 1193 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1194 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1195 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1196 *kv = aijkok->a_dual.view_device(); 1197 PetscFunctionReturn(PETSC_SUCCESS); 1198 } 1199 1200 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1201 { 1202 PetscFunctionBegin; 1203 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1204 PetscAssertPointer(kv, 2); 1205 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1206 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1207 PetscFunctionReturn(PETSC_SUCCESS); 1208 } 1209 1210 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1211 { 1212 Mat_SeqAIJKokkos *aijkok; 1213 1214 PetscFunctionBegin; 1215 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1216 PetscAssertPointer(kv, 2); 1217 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1218 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1219 *kv = aijkok->a_dual.view_device(); 1220 PetscFunctionReturn(PETSC_SUCCESS); 1221 } 1222 1223 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1224 { 1225 PetscFunctionBegin; 1226 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1227 PetscAssertPointer(kv, 2); 1228 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1229 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1230 PetscFunctionReturn(PETSC_SUCCESS); 1231 } 1232 1233 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1234 { 1235 Mat_SeqAIJKokkos *akok; 1236 1237 PetscFunctionBegin; 1238 // Create host copies of the input aij 1239 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1240 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1241 // Don't copy the vals to the host now 1242 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1243 1244 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1245 // Note we have modified device data so it will copy lazily 1246 a_dual.modify_device(); 1247 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1248 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1249 1250 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1251 PetscCall(MatCreate(comm, A)); 1252 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1253 PetscFunctionReturn(PETSC_SUCCESS); 1254 } 1255 1256 /* Computes Y += alpha X */ 1257 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1258 { 1259 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1260 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1261 ConstMatScalarKokkosView Xa; 1262 MatScalarKokkosView Ya; 1263 auto exec = PetscGetKokkosExecutionSpace(); 1264 1265 PetscFunctionBegin; 1266 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1267 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1268 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1269 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1270 PetscCall(PetscLogGpuTimeBegin()); 1271 1272 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1273 PetscBool e; 1274 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1275 if (e) { 1276 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1277 if (e) pattern = SAME_NONZERO_PATTERN; 1278 } 1279 } 1280 1281 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1282 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1283 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1284 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1285 */ 1286 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1287 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1288 Xa = xkok->a_dual.view_device(); 1289 Ya = ykok->a_dual.view_device(); 1290 1291 if (pattern == SAME_NONZERO_PATTERN) { 1292 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1293 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1294 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1295 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1296 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1297 1298 Kokkos::parallel_for( 1299 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1300 PetscInt i = t.league_rank(); // row i 1301 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1302 // Only one thread works in a team 1303 PetscInt p, q = Yi(i); 1304 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1305 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1306 if (Xj(p) == Yj(q)) { // Found it 1307 Ya(q) += alpha * Xa(p); 1308 q++; 1309 } else { 1310 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1311 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1312 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1313 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1314 #else 1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1316 #endif 1317 } 1318 } 1319 }); 1320 }); 1321 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1322 } else { // different nonzero patterns 1323 Mat Z; 1324 KokkosCsrMatrix zcsr; 1325 KernelHandle kh; 1326 kh.create_spadd_handle(true); // X, Y are sorted 1327 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1328 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1329 zkok = new Mat_SeqAIJKokkos(zcsr); 1330 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1331 PetscCall(MatHeaderReplace(Y, &Z)); 1332 kh.destroy_spadd_handle(); 1333 } 1334 PetscCall(PetscLogGpuTimeEnd()); 1335 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 struct MatCOOStruct_SeqAIJKokkos { 1340 PetscCount n; 1341 PetscCount Atot; 1342 PetscInt nz; 1343 PetscCountKokkosView jmap; 1344 PetscCountKokkosView perm; 1345 1346 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1347 { 1348 nz = coo_h->nz; 1349 n = coo_h->n; 1350 Atot = coo_h->Atot; 1351 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1352 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1353 } 1354 }; 1355 1356 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1357 { 1358 PetscFunctionBegin; 1359 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1360 PetscFunctionReturn(PETSC_SUCCESS); 1361 } 1362 1363 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1364 { 1365 Mat_SeqAIJKokkos *akok; 1366 Mat_SeqAIJ *aseq; 1367 PetscContainer container_h; 1368 MatCOOStruct_SeqAIJ *coo_h; 1369 MatCOOStruct_SeqAIJKokkos *coo_d; 1370 1371 PetscFunctionBegin; 1372 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1373 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1374 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1375 delete akok; 1376 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1377 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1378 1379 // Copy the COO struct to device 1380 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1381 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1382 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1383 1384 // Put the COO struct in a container and then attach that to the matrix 1385 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1390 { 1391 MatScalarKokkosView Aa; 1392 ConstMatScalarKokkosView kv; 1393 PetscMemType memtype; 1394 PetscContainer container; 1395 MatCOOStruct_SeqAIJKokkos *coo; 1396 1397 PetscFunctionBegin; 1398 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1399 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1400 1401 const auto &n = coo->n; 1402 const auto &Annz = coo->nz; 1403 const auto &jmap = coo->jmap; 1404 const auto &perm = coo->perm; 1405 1406 PetscCall(PetscGetMemType(v, &memtype)); 1407 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1408 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1409 } else { 1410 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1411 } 1412 1413 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1414 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1415 1416 PetscCall(PetscLogGpuTimeBegin()); 1417 Kokkos::parallel_for( 1418 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1419 PetscScalar sum = 0.0; 1420 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1421 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1422 }); 1423 PetscCall(PetscLogGpuTimeEnd()); 1424 1425 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1426 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1427 PetscFunctionReturn(PETSC_SUCCESS); 1428 } 1429 1430 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1431 { 1432 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1433 1434 PetscFunctionBegin; 1435 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1436 A->boundtocpu = PETSC_FALSE; 1437 1438 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1439 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1440 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1441 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1442 A->ops->scale = MatScale_SeqAIJKokkos; 1443 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1444 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1445 A->ops->mult = MatMult_SeqAIJKokkos; 1446 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1447 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1448 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1449 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1450 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1451 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1452 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1453 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1454 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1455 A->ops->shift = MatShift_SeqAIJKokkos; 1456 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1457 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1458 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1459 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1460 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1461 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1462 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1463 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1464 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1465 1466 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1467 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1468 #if defined(PETSC_HAVE_HYPRE) 1469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1470 #endif 1471 PetscFunctionReturn(PETSC_SUCCESS); 1472 } 1473 1474 /* 1475 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1476 1477 Input Parameters: 1478 + A - the MATSEQAIJKOKKOS matrix 1479 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1480 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1481 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1482 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1483 1484 Output Parameter: 1485 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1486 */ 1487 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1488 { 1489 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1490 PetscInt nblocks = bs.extent(0) - 1; 1491 1492 PetscFunctionBegin; 1493 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1494 1495 // Pull out the diagonal blocks of the matrix and then invert the blocks 1496 auto Aa = akok->a_dual.view_device(); 1497 auto Ai = akok->i_dual.view_device(); 1498 auto Aj = akok->j_dual.view_device(); 1499 auto Adiag = akok->diag_dual.view_device(); 1500 // TODO: how to tune the team size? 1501 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1502 auto ts = Kokkos::AUTO(); 1503 #else 1504 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1505 #endif 1506 PetscCallCXX(Kokkos::parallel_for( 1507 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1508 const PetscInt bid = teamMember.league_rank(); // block id 1509 const PetscInt rstart = bs(bid); // this block starts from this row 1510 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1511 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1512 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1513 1514 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1515 PetscInt i = rstart + r; // i-th row in A 1516 1517 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1518 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1519 1520 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1521 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1522 B(r, c) = 0.0; 1523 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1524 B(r, c) = Aa(first + c); 1525 } else { // this entry does not show up in the CSR 1526 B(r, c) = 0.0; 1527 } 1528 } 1529 } else { // rare case that the diagonal does not exist 1530 const PetscInt begin = Ai(i); 1531 const PetscInt end = Ai(i + 1); 1532 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1533 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1534 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1535 else if (Aj(j) >= rstart + m) break; 1536 } 1537 } 1538 }); 1539 1540 // LU-decompose B (w/o pivoting) and then invert B 1541 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1542 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1543 })); 1544 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1545 PetscFunctionReturn(PETSC_SUCCESS); 1546 } 1547 1548 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1549 { 1550 Mat_SeqAIJ *aseq; 1551 PetscInt i, m, n; 1552 auto exec = PetscGetKokkosExecutionSpace(); 1553 1554 PetscFunctionBegin; 1555 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1556 1557 m = akok->nrows(); 1558 n = akok->ncols(); 1559 PetscCall(MatSetSizes(A, m, n, m, n)); 1560 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1561 1562 /* Set up data structures of A as a MATSEQAIJ */ 1563 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1564 aseq = (Mat_SeqAIJ *)A->data; 1565 1566 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1567 PetscCallCXX(akok->j_dual.sync_host(exec)); 1568 PetscCallCXX(exec.fence()); 1569 1570 aseq->i = akok->i_host_data(); 1571 aseq->j = akok->j_host_data(); 1572 aseq->a = akok->a_host_data(); 1573 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1574 aseq->free_a = PETSC_FALSE; 1575 aseq->free_ij = PETSC_FALSE; 1576 aseq->nz = akok->nnz(); 1577 aseq->maxnz = aseq->nz; 1578 1579 PetscCall(PetscMalloc1(m, &aseq->imax)); 1580 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1581 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1582 1583 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1584 akok->nonzerostate = A->nonzerostate; 1585 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1586 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1587 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1588 PetscFunctionReturn(PETSC_SUCCESS); 1589 } 1590 1591 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1592 { 1593 PetscFunctionBegin; 1594 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1595 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1596 PetscFunctionReturn(PETSC_SUCCESS); 1597 } 1598 1599 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1600 { 1601 Mat_SeqAIJKokkos *akok; 1602 1603 PetscFunctionBegin; 1604 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1605 PetscCall(MatCreate(comm, A)); 1606 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1611 1612 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1613 */ 1614 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1615 { 1616 PetscFunctionBegin; 1617 PetscCall(MatCreate(comm, A)); 1618 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 /*@C 1623 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1624 (the default parallel PETSc format). This matrix will ultimately be handled by 1625 Kokkos for calculations. 1626 1627 Collective 1628 1629 Input Parameters: 1630 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1631 . m - number of rows 1632 . n - number of columns 1633 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1634 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1635 1636 Output Parameter: 1637 . A - the matrix 1638 1639 Level: intermediate 1640 1641 Notes: 1642 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1643 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1644 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1645 1646 The AIJ format, also called 1647 compressed row storage, is fully compatible with standard Fortran 1648 storage. That is, the stored row and column indices can begin at 1649 either one (as in Fortran) or zero. 1650 1651 Specify the preallocated storage with either `nz` or `nnz` (not both). 1652 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1653 allocation. 1654 1655 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1656 @*/ 1657 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1658 { 1659 PetscFunctionBegin; 1660 PetscCall(PetscKokkosInitializeCheck()); 1661 PetscCall(MatCreate(comm, A)); 1662 PetscCall(MatSetSizes(*A, m, n, m, n)); 1663 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1664 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1665 PetscFunctionReturn(PETSC_SUCCESS); 1666 } 1667 1668 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1669 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1670 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1671 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1672 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1673 { 1674 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1675 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1676 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1677 1678 PetscFunctionBegin; 1679 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1680 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1681 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1682 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1683 } 1684 PetscFunctionReturn(PETSC_SUCCESS); 1685 } 1686 1687 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1688 { 1689 const PetscInt n = A->rmap->n; 1690 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1691 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1692 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1693 1694 PetscFunctionBegin; 1695 if (!factors->transpose_updated) { 1696 if (has_upper) { 1697 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1698 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1699 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1700 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1701 } 1702 1703 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1704 if (!factors->U) { 1705 Mat_SeqAIJ *seq; 1706 1707 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1708 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1709 1710 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1711 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1712 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1713 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1714 } else { 1715 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1716 } 1717 // Copy Ut from host to device 1718 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1719 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1720 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1721 } else { // If U was computed on device, we also compute the transpose there 1722 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1723 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1724 factors->jU_d, factors->aU_d, 1725 factors->iUt_d, factors->jUt_d, 1726 factors->aUt_d)); 1727 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1728 } 1729 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1730 } 1731 1732 // do the same for L with LU 1733 if (has_lower) { 1734 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1735 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1736 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1737 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1738 } 1739 1740 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1741 if (!factors->L) { 1742 Mat_SeqAIJ *seq; 1743 1744 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1745 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1746 1747 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1748 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1749 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1750 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1751 } else { 1752 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1753 } 1754 // Copy Lt from host to device 1755 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1756 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1757 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1758 } else { // If L was computed on device, we also compute the transpose there 1759 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1760 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1761 factors->jL_d, factors->aL_d, 1762 factors->iLt_d, factors->jLt_d, 1763 factors->aLt_d)); 1764 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1765 } 1766 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1767 } 1768 1769 factors->transpose_updated = PETSC_TRUE; 1770 } 1771 PetscFunctionReturn(PETSC_SUCCESS); 1772 } 1773 1774 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1775 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1776 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1777 { 1778 auto exec = PetscGetKokkosExecutionSpace(); 1779 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1780 PetscInt m = A->rmap->n; 1781 PetscScalarKokkosView D = factors->D_d; 1782 PetscScalarKokkosView X, Y, B; // alias 1783 ConstPetscScalarKokkosView b; 1784 PetscScalarKokkosView x; 1785 PetscIntKokkosView &rowperm = factors->rowperm; 1786 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1787 1788 PetscFunctionBegin; 1789 PetscCall(PetscLogGpuTimeBegin()); 1790 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1791 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1792 PetscCall(VecGetKokkosView(bb, &b)); 1793 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1794 1795 // Solve U^T Y = B 1796 if (identity) { // Reorder b with the row permutation 1797 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1798 Y = factors->workVector; 1799 } else { 1800 B = factors->workVector; 1801 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1802 Y = x; 1803 } 1804 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1805 1806 // Solve diag(D) Y' = Y. 1807 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1808 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1809 1810 // Solve UX = Y 1811 if (identity) { 1812 X = x; 1813 } else { 1814 X = factors->workVector; // B is not needed anymore 1815 } 1816 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1817 1818 // Reorder X with the inverse column (row) permutation 1819 if (!identity) { 1820 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1821 } 1822 1823 PetscCall(VecRestoreKokkosView(bb, &b)); 1824 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1825 PetscCall(PetscLogGpuTimeEnd()); 1826 PetscFunctionReturn(PETSC_SUCCESS); 1827 } 1828 1829 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1830 // R and C are represented by rowperm and colperm in factors. 1831 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1832 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1833 { 1834 auto exec = PetscGetKokkosExecutionSpace(); 1835 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1836 PetscInt m = A->rmap->n; 1837 PetscScalarKokkosView X, Y, B; // alias 1838 ConstPetscScalarKokkosView b; 1839 PetscScalarKokkosView x; 1840 PetscIntKokkosView &rowperm = factors->rowperm; 1841 PetscIntKokkosView &colperm = factors->colperm; 1842 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1843 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1844 1845 PetscFunctionBegin; 1846 PetscCall(PetscLogGpuTimeBegin()); 1847 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1848 PetscCall(VecGetKokkosView(bb, &b)); 1849 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1850 1851 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1852 if (row_identity) { 1853 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1854 Y = factors->workVector; 1855 } else { 1856 B = factors->workVector; 1857 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1858 Y = x; 1859 } 1860 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1861 1862 // Solve U C^- x = Y 1863 if (col_identity) { 1864 X = x; 1865 } else { 1866 X = factors->workVector; 1867 } 1868 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1869 1870 // x = C X; Reorder X with the inverse col permutation 1871 if (!col_identity) { 1872 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1873 } 1874 1875 PetscCall(VecRestoreKokkosView(bb, &b)); 1876 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1877 PetscCall(PetscLogGpuTimeEnd()); 1878 PetscFunctionReturn(PETSC_SUCCESS); 1879 } 1880 1881 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1882 // R and C are represented by rowperm and colperm in factors. 1883 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1884 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1885 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1886 { 1887 auto exec = PetscGetKokkosExecutionSpace(); 1888 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1889 PetscInt m = A->rmap->n; 1890 PetscScalarKokkosView X, Y, B; // alias 1891 ConstPetscScalarKokkosView b; 1892 PetscScalarKokkosView x; 1893 PetscIntKokkosView &rowperm = factors->rowperm; 1894 PetscIntKokkosView &colperm = factors->colperm; 1895 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1896 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1897 1898 PetscFunctionBegin; 1899 PetscCall(PetscLogGpuTimeBegin()); 1900 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1901 PetscCall(VecGetKokkosView(bb, &b)); 1902 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1903 1904 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1905 if (col_identity) { // Reorder b with the col permutation 1906 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1907 Y = factors->workVector; 1908 } else { 1909 B = factors->workVector; 1910 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1911 Y = x; 1912 } 1913 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1914 1915 // Solve L^T X = Y 1916 if (row_identity) { 1917 X = x; 1918 } else { 1919 X = factors->workVector; 1920 } 1921 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1922 1923 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1924 if (!row_identity) { 1925 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1926 } 1927 1928 PetscCall(VecRestoreKokkosView(bb, &b)); 1929 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1930 PetscCall(PetscLogGpuTimeEnd()); 1931 PetscFunctionReturn(PETSC_SUCCESS); 1932 } 1933 1934 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1935 { 1936 PetscFunctionBegin; 1937 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1938 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1939 1940 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1941 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1942 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1943 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1944 const MatScalar *Ba = b->a; 1945 PetscInt m = B->rmap->n, n = B->cmap->n; 1946 1947 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1948 // Allocate memory and copy the structure 1949 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1950 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1951 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1952 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1953 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1954 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1955 1956 PetscInt *Li = factors->iL_h.data(); 1957 PetscInt *Lj = factors->jL_h.data(); 1958 PetscInt *Ui = factors->iU_h.data(); 1959 PetscInt *Uj = factors->jU_h.data(); 1960 1961 Li[0] = Ui[0] = 0; 1962 for (PetscInt i = 0; i < m; i++) { 1963 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1964 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1965 1966 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1967 Lj[Li[i] + llen] = i; // diagonal entry of L 1968 1969 Uj[Ui[i]] = i; // diagonal entry of U 1970 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1971 1972 Li[i + 1] = Li[i] + llen + 1; 1973 Ui[i + 1] = Ui[i] + ulen; 1974 } 1975 1976 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1977 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1978 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1979 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1980 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1981 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1982 1983 // Copy row/col permutation to device 1984 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1985 PetscBool row_identity; 1986 PetscCall(ISIdentity(rowperm, &row_identity)); 1987 if (!row_identity) { 1988 const PetscInt *ip; 1989 1990 PetscCall(ISGetIndices(rowperm, &ip)); 1991 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1992 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1993 PetscCall(ISRestoreIndices(rowperm, &ip)); 1994 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1995 } 1996 1997 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1998 PetscBool col_identity; 1999 PetscCall(ISIdentity(colperm, &col_identity)); 2000 if (!col_identity) { 2001 const PetscInt *ip; 2002 2003 PetscCall(ISGetIndices(colperm, &ip)); 2004 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2005 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2006 PetscCall(ISRestoreIndices(colperm, &ip)); 2007 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2008 } 2009 2010 /* Create sptrsv handles for L, U and their transpose */ 2011 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2012 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2013 #else 2014 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2015 #endif 2016 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2017 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2018 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2019 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2020 } 2021 2022 // Copy the value 2023 for (PetscInt i = 0; i < m; i++) { 2024 PetscInt llen = Bi[i + 1] - Bi[i]; 2025 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2026 const PetscInt *Li = factors->iL_h.data(); 2027 const PetscInt *Ui = factors->iU_h.data(); 2028 2029 PetscScalar *La = factors->aL_h.data(); 2030 PetscScalar *Ua = factors->aU_h.data(); 2031 2032 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2033 La[Li[i] + llen] = 1.0; // diagonal entry 2034 2035 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2036 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2037 } 2038 2039 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2040 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2041 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2042 factors->transpose_updated = PETSC_FALSE; 2043 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2044 2045 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2046 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2047 } 2048 2049 B->ops->matsolve = NULL; 2050 B->ops->matsolvetranspose = NULL; 2051 PetscFunctionReturn(PETSC_SUCCESS); 2052 } 2053 2054 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2055 { 2056 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2057 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2058 PetscInt fill_lev = info->levels; 2059 2060 PetscFunctionBegin; 2061 PetscCall(PetscLogGpuTimeBegin()); 2062 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2063 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2064 2065 auto a_d = aijkok->a_dual.view_device(); 2066 auto i_d = aijkok->i_dual.view_device(); 2067 auto j_d = aijkok->j_dual.view_device(); 2068 2069 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2070 2071 B->assembled = PETSC_TRUE; 2072 B->preallocated = PETSC_TRUE; 2073 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2074 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2075 B->ops->matsolve = NULL; 2076 B->ops->matsolvetranspose = NULL; 2077 2078 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2079 factors->transpose_updated = PETSC_FALSE; 2080 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2081 /* TODO: log flops, but how to know that? */ 2082 PetscCall(PetscLogGpuTimeEnd()); 2083 PetscFunctionReturn(PETSC_SUCCESS); 2084 } 2085 2086 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2087 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2088 { 2089 Mat_SeqAIJKokkos *aijkok; 2090 Mat_SeqAIJ *b; 2091 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2092 PetscInt fill_lev = info->levels; 2093 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2094 PetscInt n = A->rmap->n; 2095 2096 PetscFunctionBegin; 2097 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2099 2100 /* Create a spiluk handle and then do symbolic factorization */ 2101 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2102 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2103 2104 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2105 2106 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2107 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2108 Kokkos::realloc(factors->iU_d, n + 1); 2109 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2110 2111 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2112 auto i_d = aijkok->i_dual.view_device(); 2113 auto j_d = aijkok->j_dual.view_device(); 2114 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2115 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2116 2117 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2118 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2119 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2120 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2121 2122 /* TODO: add options to select sptrsv algorithms */ 2123 /* Create sptrsv handles for L, U and their transpose */ 2124 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2125 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2126 #else 2127 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2128 #endif 2129 2130 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2131 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2132 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2133 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2134 2135 /* Fill fields of the factor matrix B */ 2136 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2137 b = (Mat_SeqAIJ *)B->data; 2138 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2139 B->info.fill_ratio_given = info->fill; 2140 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2141 2142 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2143 PetscFunctionReturn(PETSC_SUCCESS); 2144 } 2145 2146 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2147 { 2148 PetscFunctionBegin; 2149 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2150 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2151 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2152 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2153 PetscFunctionReturn(PETSC_SUCCESS); 2154 } 2155 2156 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2157 { 2158 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2159 2160 PetscFunctionBegin; 2161 if (!info->factoronhost) { 2162 PetscCall(ISIdentity(isrow, &row_identity)); 2163 PetscCall(ISIdentity(iscol, &col_identity)); 2164 } 2165 2166 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2167 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2168 2169 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2170 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2171 } else { 2172 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2173 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2174 } 2175 PetscFunctionReturn(PETSC_SUCCESS); 2176 } 2177 2178 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2179 { 2180 PetscFunctionBegin; 2181 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2182 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2183 2184 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2185 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2186 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2187 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2188 const MatScalar *Ba = b->a; 2189 PetscInt m = B->rmap->n; 2190 2191 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2192 // Allocate memory and copy the structure 2193 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2194 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2195 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2196 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2197 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2198 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2199 2200 // Build jU_h from the skewed Aj 2201 PetscInt *Uj = factors->jU_h.data(); 2202 for (PetscInt i = 0; i < m; i++) { 2203 PetscInt ulen = Bi[i + 1] - Bi[i]; 2204 Uj[Bi[i]] = i; // diagonal entry 2205 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2206 } 2207 2208 // Copy iU, jU to device 2209 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2210 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2211 2212 // Copy row/col permutation to device 2213 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2214 PetscBool row_identity; 2215 PetscCall(ISIdentity(rowperm, &row_identity)); 2216 if (!row_identity) { 2217 const PetscInt *ip; 2218 2219 PetscCall(ISGetIndices(rowperm, &ip)); 2220 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2221 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2222 PetscCall(ISRestoreIndices(rowperm, &ip)); 2223 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2224 } 2225 2226 // Create sptrsv handles for U and U^T 2227 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2228 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2229 #else 2230 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2231 #endif 2232 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2233 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2234 } 2235 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2236 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2237 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2238 2239 // Copy the value 2240 PetscScalar *Ua = factors->aU_h.data(); 2241 PetscScalar *D = factors->D_h.data(); 2242 for (PetscInt i = 0; i < m; i++) { 2243 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2244 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2245 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2246 } 2247 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2248 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2249 2250 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2251 factors->transpose_updated = PETSC_FALSE; 2252 } 2253 2254 B->ops->matsolve = NULL; 2255 B->ops->matsolvetranspose = NULL; 2256 PetscFunctionReturn(PETSC_SUCCESS); 2257 } 2258 2259 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2260 { 2261 PetscFunctionBegin; 2262 if (info->solveonhost) { 2263 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2264 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2265 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2266 } 2267 2268 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2269 2270 if (!info->solveonhost) { 2271 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2272 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2273 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2274 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2275 } 2276 PetscFunctionReturn(PETSC_SUCCESS); 2277 } 2278 2279 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2280 { 2281 PetscFunctionBegin; 2282 if (info->solveonhost) { 2283 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2284 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2285 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2286 } 2287 2288 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2289 2290 if (!info->solveonhost) { 2291 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2292 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2293 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2294 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2295 } 2296 PetscFunctionReturn(PETSC_SUCCESS); 2297 } 2298 2299 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2300 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2301 { 2302 PetscFunctionBegin; 2303 *type = MATSOLVERKOKKOS; 2304 PetscFunctionReturn(PETSC_SUCCESS); 2305 } 2306 2307 /*MC 2308 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2309 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2310 2311 Level: beginner 2312 2313 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2314 M*/ 2315 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2316 { 2317 PetscInt n = A->rmap->n; 2318 MPI_Comm comm; 2319 2320 PetscFunctionBegin; 2321 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2322 PetscCall(MatCreate(comm, B)); 2323 PetscCall(MatSetSizes(*B, n, n, n, n)); 2324 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2325 (*B)->factortype = ftype; 2326 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2327 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2328 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2329 2330 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2331 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2332 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2333 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2334 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2335 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2336 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2337 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2338 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2339 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2340 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2341 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2342 2343 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2344 (*B)->canuseordering = PETSC_TRUE; 2345 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2346 PetscFunctionReturn(PETSC_SUCCESS); 2347 } 2348 2349 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2350 { 2351 PetscFunctionBegin; 2352 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2353 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2354 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2355 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2356 PetscFunctionReturn(PETSC_SUCCESS); 2357 } 2358 2359 /* Utility to print out a KokkosCsrMatrix for debugging */ 2360 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2361 { 2362 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2363 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2364 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2365 const PetscInt *i = iv.data(); 2366 const PetscInt *j = jv.data(); 2367 const PetscScalar *a = av.data(); 2368 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2369 2370 PetscFunctionBegin; 2371 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2372 for (PetscInt k = 0; k < m; k++) { 2373 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2374 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2375 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2376 } 2377 PetscFunctionReturn(PETSC_SUCCESS); 2378 } 2379