1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 delete aijkok; 82 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 83 A->spptr = aijkok; 84 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 85 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 86 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 87 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 88 } 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* Sync CSR data to device if not yet */ 93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 94 { 95 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 96 97 PetscFunctionBegin; 98 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 99 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 100 if (aijkok->a_dual.need_sync_device()) { 101 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 102 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 103 aijkok->hermitian_updated = PETSC_FALSE; 104 } 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 /* Mark the CSR data on device as modified */ 109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 110 { 111 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 112 113 PetscFunctionBegin; 114 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 115 aijkok->a_dual.clear_sync_state(); 116 aijkok->a_dual.modify_device(); 117 aijkok->transpose_updated = PETSC_FALSE; 118 aijkok->hermitian_updated = PETSC_FALSE; 119 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 120 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 auto exec = PetscGetKokkosExecutionSpace(); 128 129 PetscFunctionBegin; 130 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 131 /* We do not expect one needs factors on host */ 132 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 133 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 134 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 139 { 140 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 141 142 PetscFunctionBegin; 143 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 144 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 145 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 146 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 147 */ 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 150 *array = aijkok->a_dual.view_host().data(); 151 } else { /* Happens when calling MatSetValues on a newly created matrix */ 152 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 153 } 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 173 *array = aijkok->a_dual.view_host().data(); 174 } else { 175 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 176 } 177 PetscFunctionReturn(PETSC_SUCCESS); 178 } 179 180 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 181 { 182 PetscFunctionBegin; 183 *array = NULL; 184 PetscFunctionReturn(PETSC_SUCCESS); 185 } 186 187 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 188 { 189 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 190 191 PetscFunctionBegin; 192 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 193 *array = aijkok->a_dual.view_host().data(); 194 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 195 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 196 } 197 PetscFunctionReturn(PETSC_SUCCESS); 198 } 199 200 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 204 PetscFunctionBegin; 205 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 206 aijkok->a_dual.clear_sync_state(); 207 aijkok->a_dual.modify_host(); 208 } 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 213 { 214 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 215 216 PetscFunctionBegin; 217 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 218 219 if (i) *i = aijkok->i_device_data(); 220 if (j) *j = aijkok->j_device_data(); 221 if (a) { 222 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 223 *a = aijkok->a_device_data(); 224 } 225 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 226 PetscFunctionReturn(PETSC_SUCCESS); 227 } 228 229 /* 230 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 231 232 Input Parameter: 233 . A - the MATSEQAIJKOKKOS matrix 234 235 Output Parameters: 236 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 237 - T_d - the transpose on device, whose value array is allocated but not initialized 238 */ 239 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 240 { 241 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 242 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 243 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 244 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 245 MatRowMapType *Ti = Ti_h.data(); 246 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 247 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 248 PetscInt *Tj = Tj_h.data(); 249 PetscInt *perm = perm_h.data(); 250 PetscInt *offset; 251 252 PetscFunctionBegin; 253 // Populate Ti 254 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 255 Ti++; 256 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 257 Ti--; 258 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 259 260 // Populate Tj and the permutation array 261 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 262 for (PetscInt i = 0; i < m; i++) { 263 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 264 PetscInt r = Aj[j]; // row r of T 265 PetscInt disp = Ti[r] + offset[r]; 266 267 Tj[disp] = i; // col i of T 268 perm[disp] = j; 269 offset[r]++; 270 } 271 } 272 PetscCall(PetscFree(offset)); 273 274 // Sort each row of T, along with the permutation array 275 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 276 277 // Output perm and T on device 278 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 279 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 280 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 281 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 282 PetscFunctionReturn(PETSC_SUCCESS); 283 } 284 285 // Generate the transpose on device and cache it internally 286 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 287 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 288 { 289 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 290 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 291 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 292 KokkosCsrMatrix &T = akok->csrmatT; 293 294 PetscFunctionBegin; 295 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 296 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 297 298 const auto &Aa = akok->a_dual.view_device(); 299 300 if (A->symmetric == PETSC_BOOL3_TRUE) { 301 *csrmatT = akok->csrmat; 302 } else { 303 // See if we already have a cached transpose and its value is up to date 304 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 305 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 306 const auto &perm = akok->transpose_perm; // get the permutation array 307 auto &Ta = T.values; 308 309 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 310 } 311 } else { // Generate T of size n x m for the first time 312 MatRowMapKokkosView perm; 313 314 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 315 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 316 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 317 } 318 akok->transpose_updated = PETSC_TRUE; 319 *csrmatT = akok->csrmatT; 320 } 321 PetscFunctionReturn(PETSC_SUCCESS); 322 } 323 324 // Generate the Hermitian on device and cache it internally 325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 326 { 327 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 328 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 329 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 330 KokkosCsrMatrix &T = akok->csrmatH; 331 332 PetscFunctionBegin; 333 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 334 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 335 336 const auto &Aa = akok->a_dual.view_device(); 337 338 if (A->hermitian == PETSC_BOOL3_TRUE) { 339 *csrmatH = akok->csrmat; 340 } else { 341 // See if we already have a cached hermitian and its value is up to date 342 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 343 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 344 const auto &perm = akok->transpose_perm; // get the permutation array 345 auto &Ta = T.values; 346 347 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 348 } 349 } else { // Generate T of size n x m for the first time 350 MatRowMapKokkosView perm; 351 352 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 353 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 354 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 355 } 356 akok->hermitian_updated = PETSC_TRUE; 357 *csrmatH = akok->csrmatH; 358 } 359 PetscFunctionReturn(PETSC_SUCCESS); 360 } 361 362 /* y = A x */ 363 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 364 { 365 Mat_SeqAIJKokkos *aijkok; 366 ConstPetscScalarKokkosView xv; 367 PetscScalarKokkosView yv; 368 369 PetscFunctionBegin; 370 PetscCall(PetscLogGpuTimeBegin()); 371 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 372 PetscCall(VecGetKokkosView(xx, &xv)); 373 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 374 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 375 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 376 PetscCall(VecRestoreKokkosView(xx, &xv)); 377 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 378 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 379 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 380 PetscCall(PetscLogGpuTimeEnd()); 381 PetscFunctionReturn(PETSC_SUCCESS); 382 } 383 384 /* y = A^T x */ 385 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 386 { 387 Mat_SeqAIJKokkos *aijkok; 388 const char *mode; 389 ConstPetscScalarKokkosView xv; 390 PetscScalarKokkosView yv; 391 KokkosCsrMatrix csrmat; 392 393 PetscFunctionBegin; 394 PetscCall(PetscLogGpuTimeBegin()); 395 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 396 PetscCall(VecGetKokkosView(xx, &xv)); 397 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 398 if (A->form_explicit_transpose) { 399 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 400 mode = "N"; 401 } else { 402 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 403 csrmat = aijkok->csrmat; 404 mode = "T"; 405 } 406 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 407 PetscCall(VecRestoreKokkosView(xx, &xv)); 408 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 409 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 410 PetscCall(PetscLogGpuTimeEnd()); 411 PetscFunctionReturn(PETSC_SUCCESS); 412 } 413 414 /* y = A^H x */ 415 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 416 { 417 Mat_SeqAIJKokkos *aijkok; 418 const char *mode; 419 ConstPetscScalarKokkosView xv; 420 PetscScalarKokkosView yv; 421 KokkosCsrMatrix csrmat; 422 423 PetscFunctionBegin; 424 PetscCall(PetscLogGpuTimeBegin()); 425 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 426 PetscCall(VecGetKokkosView(xx, &xv)); 427 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 428 if (A->form_explicit_transpose) { 429 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 430 mode = "N"; 431 } else { 432 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 433 csrmat = aijkok->csrmat; 434 mode = "C"; 435 } 436 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 437 PetscCall(VecRestoreKokkosView(xx, &xv)); 438 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 439 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 440 PetscCall(PetscLogGpuTimeEnd()); 441 PetscFunctionReturn(PETSC_SUCCESS); 442 } 443 444 /* z = A x + y */ 445 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 446 { 447 Mat_SeqAIJKokkos *aijkok; 448 ConstPetscScalarKokkosView xv; 449 PetscScalarKokkosView zv; 450 451 PetscFunctionBegin; 452 PetscCall(PetscLogGpuTimeBegin()); 453 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 454 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 455 PetscCall(VecGetKokkosView(xx, &xv)); 456 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 457 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 458 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 459 PetscCall(VecRestoreKokkosView(xx, &xv)); 460 PetscCall(VecRestoreKokkosView(zz, &zv)); 461 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 462 PetscCall(PetscLogGpuTimeEnd()); 463 PetscFunctionReturn(PETSC_SUCCESS); 464 } 465 466 /* z = A^T x + y */ 467 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 468 { 469 Mat_SeqAIJKokkos *aijkok; 470 const char *mode; 471 ConstPetscScalarKokkosView xv; 472 PetscScalarKokkosView zv; 473 KokkosCsrMatrix csrmat; 474 475 PetscFunctionBegin; 476 PetscCall(PetscLogGpuTimeBegin()); 477 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 478 if (zz != yy) PetscCall(VecCopy(yy, zz)); 479 PetscCall(VecGetKokkosView(xx, &xv)); 480 PetscCall(VecGetKokkosView(zz, &zv)); 481 if (A->form_explicit_transpose) { 482 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 483 mode = "N"; 484 } else { 485 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 486 csrmat = aijkok->csrmat; 487 mode = "T"; 488 } 489 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 490 PetscCall(VecRestoreKokkosView(xx, &xv)); 491 PetscCall(VecRestoreKokkosView(zz, &zv)); 492 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 493 PetscCall(PetscLogGpuTimeEnd()); 494 PetscFunctionReturn(PETSC_SUCCESS); 495 } 496 497 /* z = A^H x + y */ 498 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 499 { 500 Mat_SeqAIJKokkos *aijkok; 501 const char *mode; 502 ConstPetscScalarKokkosView xv; 503 PetscScalarKokkosView zv; 504 KokkosCsrMatrix csrmat; 505 506 PetscFunctionBegin; 507 PetscCall(PetscLogGpuTimeBegin()); 508 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 509 if (zz != yy) PetscCall(VecCopy(yy, zz)); 510 PetscCall(VecGetKokkosView(xx, &xv)); 511 PetscCall(VecGetKokkosView(zz, &zv)); 512 if (A->form_explicit_transpose) { 513 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 514 mode = "N"; 515 } else { 516 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 517 csrmat = aijkok->csrmat; 518 mode = "C"; 519 } 520 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 521 PetscCall(VecRestoreKokkosView(xx, &xv)); 522 PetscCall(VecRestoreKokkosView(zz, &zv)); 523 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 524 PetscCall(PetscLogGpuTimeEnd()); 525 PetscFunctionReturn(PETSC_SUCCESS); 526 } 527 528 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 529 { 530 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 531 532 PetscFunctionBegin; 533 switch (op) { 534 case MAT_FORM_EXPLICIT_TRANSPOSE: 535 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 536 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 537 A->form_explicit_transpose = flg; 538 break; 539 default: 540 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 541 break; 542 } 543 PetscFunctionReturn(PETSC_SUCCESS); 544 } 545 546 /* Depending on reuse, either build a new mat, or use the existing mat */ 547 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 548 { 549 Mat_SeqAIJ *aseq; 550 551 PetscFunctionBegin; 552 PetscCall(PetscKokkosInitializeCheck()); 553 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 554 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 555 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 556 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 557 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 558 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 559 PetscCall(PetscFree(A->defaultvectype)); 560 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 561 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 562 PetscCall(MatSetOps_SeqAIJKokkos(A)); 563 aseq = static_cast<Mat_SeqAIJ *>(A->data); 564 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 565 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 566 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 567 } 568 } 569 PetscFunctionReturn(PETSC_SUCCESS); 570 } 571 572 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 573 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 574 */ 575 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 576 { 577 Mat_SeqAIJ *bseq; 578 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 579 Mat mat; 580 581 PetscFunctionBegin; 582 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 583 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 584 mat = *B; 585 if (A->assembled) { 586 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 587 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 588 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 589 /* Now copy values to B if needed */ 590 if (dupOption == MAT_COPY_VALUES) { 591 if (akok->a_dual.need_sync_device()) { 592 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 593 bkok->a_dual.modify_host(); 594 } else { /* If device has the latest data, we only copy data on device */ 595 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 596 bkok->a_dual.modify_device(); 597 } 598 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 599 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 600 bkok->a_dual.modify_host(); 601 } 602 mat->spptr = bkok; 603 } 604 605 PetscCall(PetscFree(mat->defaultvectype)); 606 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 607 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 608 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 609 PetscFunctionReturn(PETSC_SUCCESS); 610 } 611 612 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 613 { 614 Mat At; 615 KokkosCsrMatrix internT; 616 Mat_SeqAIJKokkos *atkok, *bkok; 617 618 PetscFunctionBegin; 619 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 620 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 621 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 622 /* Deep copy internT, as we want to isolate the internal transpose */ 623 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 624 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 625 if (reuse == MAT_INITIAL_MATRIX) *B = At; 626 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 627 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 628 if ((*B)->assembled) { 629 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 630 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 631 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 632 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 633 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 634 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 635 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 636 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 637 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 638 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 639 } 640 PetscFunctionReturn(PETSC_SUCCESS); 641 } 642 643 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 644 { 645 Mat_SeqAIJKokkos *aijkok; 646 647 PetscFunctionBegin; 648 if (A->factortype == MAT_FACTOR_NONE) { 649 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 650 delete aijkok; 651 } else { 652 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 653 } 654 A->spptr = NULL; 655 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 656 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 657 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 658 #if defined(PETSC_HAVE_HYPRE) 659 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 660 #endif 661 PetscCall(MatDestroy_SeqAIJ(A)); 662 PetscFunctionReturn(PETSC_SUCCESS); 663 } 664 665 /*MC 666 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 667 668 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 669 670 Options Database Key: 671 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 672 673 Level: beginner 674 675 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 676 M*/ 677 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 678 { 679 PetscFunctionBegin; 680 PetscCall(PetscKokkosInitializeCheck()); 681 PetscCall(MatCreate_SeqAIJ(A)); 682 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 683 PetscFunctionReturn(PETSC_SUCCESS); 684 } 685 686 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 687 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 688 { 689 Mat_SeqAIJ *a, *b; 690 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 691 MatScalarKokkosView aa, ba, ca; 692 MatRowMapKokkosView ai, bi, ci; 693 MatColIdxKokkosView aj, bj, cj; 694 PetscInt m, n, nnz, aN; 695 696 PetscFunctionBegin; 697 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 698 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 699 PetscAssertPointer(C, 4); 700 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 701 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 702 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 703 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 704 705 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 706 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 707 a = static_cast<Mat_SeqAIJ *>(A->data); 708 b = static_cast<Mat_SeqAIJ *>(B->data); 709 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 710 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 711 aa = akok->a_dual.view_device(); 712 ai = akok->i_dual.view_device(); 713 ba = bkok->a_dual.view_device(); 714 bi = bkok->i_dual.view_device(); 715 m = A->rmap->n; /* M, N and nnz of C */ 716 n = A->cmap->n + B->cmap->n; 717 nnz = a->nz + b->nz; 718 aN = A->cmap->n; /* N of A */ 719 if (reuse == MAT_INITIAL_MATRIX) { 720 aj = akok->j_dual.view_device(); 721 bj = bkok->j_dual.view_device(); 722 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 723 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 724 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 725 ca = ca_dual.view_device(); 726 ci = ci_dual.view_device(); 727 cj = cj_dual.view_device(); 728 729 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 730 Kokkos::parallel_for( 731 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 732 PetscInt i = t.league_rank(); /* row i */ 733 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 734 735 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 736 ci(i) = coffset; 737 if (i == m - 1) ci(m) = ai(m) + bi(m); 738 }); 739 740 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741 if (k < alen) { 742 ca(coffset + k) = aa(ai(i) + k); 743 cj(coffset + k) = aj(ai(i) + k); 744 } else { 745 ca(coffset + k) = ba(bi(i) + k - alen); 746 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 747 } 748 }); 749 }); 750 ca_dual.modify_device(); 751 ci_dual.modify_device(); 752 cj_dual.modify_device(); 753 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 754 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 755 } else if (reuse == MAT_REUSE_MATRIX) { 756 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 757 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 758 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 759 ca = ckok->a_dual.view_device(); 760 ci = ckok->i_dual.view_device(); 761 762 Kokkos::parallel_for( 763 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 764 PetscInt i = t.league_rank(); /* row i */ 765 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 766 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 767 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 768 else ca(ci(i) + k) = ba(bi(i) + k - alen); 769 }); 770 }); 771 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 772 } 773 PetscFunctionReturn(PETSC_SUCCESS); 774 } 775 776 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 777 { 778 PetscFunctionBegin; 779 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 780 PetscFunctionReturn(PETSC_SUCCESS); 781 } 782 783 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 784 { 785 Mat_Product *product = C->product; 786 Mat A, B; 787 bool transA, transB; /* use bool, since KK needs this type */ 788 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 789 Mat_SeqAIJ *c; 790 MatProductData_SeqAIJKokkos *pdata; 791 KokkosCsrMatrix csrmatA, csrmatB; 792 793 PetscFunctionBegin; 794 MatCheckProduct(C, 1); 795 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 796 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 797 798 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 799 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 800 // we still do numeric. 801 if (pdata->reusesym) { // numeric reuses results from symbolic 802 pdata->reusesym = PETSC_FALSE; 803 PetscFunctionReturn(PETSC_SUCCESS); 804 } 805 806 switch (product->type) { 807 case MATPRODUCT_AB: 808 transA = false; 809 transB = false; 810 break; 811 case MATPRODUCT_AtB: 812 transA = true; 813 transB = false; 814 break; 815 case MATPRODUCT_ABt: 816 transA = false; 817 transB = true; 818 break; 819 default: 820 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 821 } 822 823 A = product->A; 824 B = product->B; 825 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 826 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 827 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 828 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 829 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 830 831 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 832 833 csrmatA = akok->csrmat; 834 csrmatB = bkok->csrmat; 835 836 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 837 if (transA) { 838 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 839 transA = false; 840 } 841 842 if (transB) { 843 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 844 transB = false; 845 } 846 PetscCall(PetscLogGpuTimeBegin()); 847 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 848 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 849 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 850 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 851 #endif 852 853 PetscCall(PetscLogGpuTimeEnd()); 854 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 855 /* shorter version of MatAssemblyEnd_SeqAIJ */ 856 c = (Mat_SeqAIJ *)C->data; 857 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 858 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 859 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 860 c->reallocs = 0; 861 C->info.mallocs = 0; 862 C->info.nz_unneeded = 0; 863 C->assembled = C->was_assembled = PETSC_TRUE; 864 C->num_ass++; 865 PetscFunctionReturn(PETSC_SUCCESS); 866 } 867 868 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 869 { 870 Mat_Product *product = C->product; 871 MatProductType ptype; 872 Mat A, B; 873 bool transA, transB; 874 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 875 MatProductData_SeqAIJKokkos *pdata; 876 MPI_Comm comm; 877 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 878 879 PetscFunctionBegin; 880 MatCheckProduct(C, 1); 881 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 882 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 883 A = product->A; 884 B = product->B; 885 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 886 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 887 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 888 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 889 csrmatA = akok->csrmat; 890 csrmatB = bkok->csrmat; 891 892 ptype = product->type; 893 // Take advantage of the symmetry if true 894 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 895 ptype = MATPRODUCT_AB; 896 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 897 } 898 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 899 ptype = MATPRODUCT_AB; 900 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 901 } 902 903 switch (ptype) { 904 case MATPRODUCT_AB: 905 transA = false; 906 transB = false; 907 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 908 break; 909 case MATPRODUCT_AtB: 910 transA = true; 911 transB = false; 912 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 913 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 914 break; 915 case MATPRODUCT_ABt: 916 transA = false; 917 transB = true; 918 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 919 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 920 break; 921 default: 922 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 923 } 924 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 925 pdata->reusesym = product->api_user; 926 927 /* TODO: add command line options to select spgemm algorithms */ 928 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 929 930 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 932 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 933 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 934 #endif 935 #endif 936 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 937 938 PetscCall(PetscLogGpuTimeBegin()); 939 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 940 if (transA) { 941 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 942 transA = false; 943 } 944 945 if (transB) { 946 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 947 transB = false; 948 } 949 950 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 951 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 952 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 953 calling new Mat_SeqAIJKokkos(). 954 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 955 */ 956 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 958 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 959 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 960 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 961 #endif 962 PetscCall(PetscLogGpuTimeEnd()); 963 964 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 965 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 966 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 /* handles sparse matrix matrix ops */ 971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 972 { 973 Mat_Product *product = mat->product; 974 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 975 976 PetscFunctionBegin; 977 MatCheckProduct(mat, 1); 978 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 979 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 980 if (Biskok && Ciskok) { 981 switch (product->type) { 982 case MATPRODUCT_AB: 983 case MATPRODUCT_AtB: 984 case MATPRODUCT_ABt: 985 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 986 break; 987 case MATPRODUCT_PtAP: 988 case MATPRODUCT_RARt: 989 case MATPRODUCT_ABC: 990 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 991 break; 992 default: 993 break; 994 } 995 } else { /* fallback for AIJ */ 996 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 997 } 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1002 { 1003 Mat_SeqAIJKokkos *aijkok; 1004 1005 PetscFunctionBegin; 1006 PetscCall(PetscLogGpuTimeBegin()); 1007 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1010 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1011 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1012 PetscCall(PetscLogGpuTimeEnd()); 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1017 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1018 { 1019 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1020 1021 PetscFunctionBegin; 1022 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1023 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1024 1025 PetscCall(PetscLogGpuTimeBegin()); 1026 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1027 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1028 const auto &Aa = aijkok->a_dual.view_device(); 1029 const auto &Adiag = aijkok->diag_dual.view_device(); 1030 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1031 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1032 PetscCall(PetscLogGpuFlops(n)); 1033 PetscCall(PetscLogGpuTimeEnd()); 1034 } else { // need reassembly, very slow! 1035 PetscCall(MatShift_Basic(A, a)); 1036 } 1037 PetscFunctionReturn(PETSC_SUCCESS); 1038 } 1039 1040 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1041 { 1042 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1043 1044 PetscFunctionBegin; 1045 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1046 ConstPetscScalarKokkosView dv; 1047 PetscInt n, nv; 1048 1049 PetscCall(PetscLogGpuTimeBegin()); 1050 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1051 PetscCall(VecGetKokkosView(D, &dv)); 1052 PetscCall(VecGetLocalSize(D, &nv)); 1053 n = PetscMin(Y->rmap->n, Y->cmap->n); 1054 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1055 1056 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1057 const auto &Aa = aijkok->a_dual.view_device(); 1058 const auto &Adiag = aijkok->diag_dual.view_device(); 1059 PetscCallCXX(Kokkos::parallel_for( 1060 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1061 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1062 else Aa(Adiag(i)) += dv(i); 1063 })); 1064 PetscCall(VecRestoreKokkosView(D, &dv)); 1065 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1066 PetscCall(PetscLogGpuFlops(n)); 1067 PetscCall(PetscLogGpuTimeEnd()); 1068 } else { // need reassembly, very slow! 1069 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1070 } 1071 PetscFunctionReturn(PETSC_SUCCESS); 1072 } 1073 1074 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1075 { 1076 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1077 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1078 ConstPetscScalarKokkosView lv, rv; 1079 1080 PetscFunctionBegin; 1081 PetscCall(PetscLogGpuTimeBegin()); 1082 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1083 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1084 const auto &Aa = aijkok->a_dual.view_device(); 1085 const auto &Ai = aijkok->i_dual.view_device(); 1086 const auto &Aj = aijkok->j_dual.view_device(); 1087 if (ll) { 1088 PetscCall(VecGetLocalSize(ll, &m)); 1089 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1090 PetscCall(VecGetKokkosView(ll, &lv)); 1091 PetscCallCXX(Kokkos::parallel_for( // for each row 1092 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1093 PetscInt i = t.league_rank(); // row i 1094 PetscInt len = Ai(i + 1) - Ai(i); 1095 // scale entries on the row 1096 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1097 })); 1098 PetscCall(VecRestoreKokkosView(ll, &lv)); 1099 PetscCall(PetscLogGpuFlops(nz)); 1100 } 1101 if (rr) { 1102 PetscCall(VecGetLocalSize(rr, &n)); 1103 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1104 PetscCall(VecGetKokkosView(rr, &rv)); 1105 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1106 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1107 PetscCall(VecRestoreKokkosView(rr, &lv)); 1108 PetscCall(PetscLogGpuFlops(nz)); 1109 } 1110 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1111 PetscCall(PetscLogGpuTimeEnd()); 1112 PetscFunctionReturn(PETSC_SUCCESS); 1113 } 1114 1115 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1116 { 1117 Mat_SeqAIJKokkos *aijkok; 1118 1119 PetscFunctionBegin; 1120 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1121 if (aijkok) { /* Only zero the device if data is already there */ 1122 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1123 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1124 } else { /* Might be preallocated but not assembled */ 1125 PetscCall(MatZeroEntries_SeqAIJ(A)); 1126 } 1127 PetscFunctionReturn(PETSC_SUCCESS); 1128 } 1129 1130 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1131 { 1132 Mat_SeqAIJKokkos *aijkok; 1133 PetscInt n; 1134 PetscScalarKokkosView xv; 1135 1136 PetscFunctionBegin; 1137 PetscCall(VecGetLocalSize(x, &n)); 1138 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1139 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1140 1141 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1142 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1143 1144 const auto &Aa = aijkok->a_dual.view_device(); 1145 const auto &Ai = aijkok->i_dual.view_device(); 1146 const auto &Adiag = aijkok->diag_dual.view_device(); 1147 1148 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1149 Kokkos::parallel_for( 1150 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1151 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1152 else xv(i) = 0; 1153 }); 1154 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1155 PetscFunctionReturn(PETSC_SUCCESS); 1156 } 1157 1158 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1160 { 1161 Mat_SeqAIJKokkos *aijkok; 1162 1163 PetscFunctionBegin; 1164 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1165 PetscAssertPointer(kv, 2); 1166 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1167 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1168 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1169 *kv = aijkok->a_dual.view_device(); 1170 PetscFunctionReturn(PETSC_SUCCESS); 1171 } 1172 1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1174 { 1175 PetscFunctionBegin; 1176 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1177 PetscAssertPointer(kv, 2); 1178 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1179 PetscFunctionReturn(PETSC_SUCCESS); 1180 } 1181 1182 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1183 { 1184 Mat_SeqAIJKokkos *aijkok; 1185 1186 PetscFunctionBegin; 1187 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1188 PetscAssertPointer(kv, 2); 1189 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1190 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1191 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1192 *kv = aijkok->a_dual.view_device(); 1193 PetscFunctionReturn(PETSC_SUCCESS); 1194 } 1195 1196 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1197 { 1198 PetscFunctionBegin; 1199 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1200 PetscAssertPointer(kv, 2); 1201 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1203 PetscFunctionReturn(PETSC_SUCCESS); 1204 } 1205 1206 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1207 { 1208 Mat_SeqAIJKokkos *aijkok; 1209 1210 PetscFunctionBegin; 1211 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1212 PetscAssertPointer(kv, 2); 1213 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1214 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1215 *kv = aijkok->a_dual.view_device(); 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1220 { 1221 PetscFunctionBegin; 1222 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1223 PetscAssertPointer(kv, 2); 1224 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1225 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1226 PetscFunctionReturn(PETSC_SUCCESS); 1227 } 1228 1229 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1230 { 1231 Mat_SeqAIJKokkos *akok; 1232 1233 PetscFunctionBegin; 1234 auto exec = PetscGetKokkosExecutionSpace(); 1235 // Create host copies of the input aij 1236 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1237 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1238 // Don't copy the vals to the host now 1239 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1240 1241 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1242 // Note we have modified device data so it will copy lazily 1243 a_dual.modify_device(); 1244 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1245 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1246 1247 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1248 PetscCall(MatCreate(comm, A)); 1249 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1250 PetscFunctionReturn(PETSC_SUCCESS); 1251 } 1252 1253 /* Computes Y += alpha X */ 1254 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1255 { 1256 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1257 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1258 ConstMatScalarKokkosView Xa; 1259 MatScalarKokkosView Ya; 1260 auto exec = PetscGetKokkosExecutionSpace(); 1261 1262 PetscFunctionBegin; 1263 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1264 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1265 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1266 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1267 PetscCall(PetscLogGpuTimeBegin()); 1268 1269 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1270 PetscBool e; 1271 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1272 if (e) { 1273 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1274 if (e) pattern = SAME_NONZERO_PATTERN; 1275 } 1276 } 1277 1278 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1279 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1280 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1281 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1282 */ 1283 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1284 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1285 Xa = xkok->a_dual.view_device(); 1286 Ya = ykok->a_dual.view_device(); 1287 1288 if (pattern == SAME_NONZERO_PATTERN) { 1289 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1290 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1291 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1292 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1293 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1294 1295 Kokkos::parallel_for( 1296 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1297 PetscInt i = t.league_rank(); // row i 1298 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1299 // Only one thread works in a team 1300 PetscInt p, q = Yi(i); 1301 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1302 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1303 if (Xj(p) == Yj(q)) { // Found it 1304 Ya(q) += alpha * Xa(p); 1305 q++; 1306 } else { 1307 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1308 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1309 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1310 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1311 #else 1312 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1313 #endif 1314 } 1315 } 1316 }); 1317 }); 1318 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1319 } else { // different nonzero patterns 1320 Mat Z; 1321 KokkosCsrMatrix zcsr; 1322 KernelHandle kh; 1323 kh.create_spadd_handle(true); // X, Y are sorted 1324 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1325 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1326 zkok = new Mat_SeqAIJKokkos(zcsr); 1327 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1328 PetscCall(MatHeaderReplace(Y, &Z)); 1329 kh.destroy_spadd_handle(); 1330 } 1331 PetscCall(PetscLogGpuTimeEnd()); 1332 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1333 PetscFunctionReturn(PETSC_SUCCESS); 1334 } 1335 1336 struct MatCOOStruct_SeqAIJKokkos { 1337 PetscCount n; 1338 PetscCount Atot; 1339 PetscInt nz; 1340 PetscCountKokkosView jmap; 1341 PetscCountKokkosView perm; 1342 1343 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1344 { 1345 nz = coo_h->nz; 1346 n = coo_h->n; 1347 Atot = coo_h->Atot; 1348 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1349 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1350 } 1351 }; 1352 1353 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1354 { 1355 PetscFunctionBegin; 1356 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1357 PetscFunctionReturn(PETSC_SUCCESS); 1358 } 1359 1360 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1361 { 1362 Mat_SeqAIJKokkos *akok; 1363 Mat_SeqAIJ *aseq; 1364 PetscContainer container_h; 1365 MatCOOStruct_SeqAIJ *coo_h; 1366 MatCOOStruct_SeqAIJKokkos *coo_d; 1367 1368 PetscFunctionBegin; 1369 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1370 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1371 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1372 delete akok; 1373 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1374 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1375 1376 // Copy the COO struct to device 1377 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1378 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1379 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1380 1381 // Put the COO struct in a container and then attach that to the matrix 1382 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1387 { 1388 MatScalarKokkosView Aa; 1389 ConstMatScalarKokkosView kv; 1390 PetscMemType memtype; 1391 PetscContainer container; 1392 MatCOOStruct_SeqAIJKokkos *coo; 1393 1394 PetscFunctionBegin; 1395 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1396 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1397 1398 const auto &n = coo->n; 1399 const auto &Annz = coo->nz; 1400 const auto &jmap = coo->jmap; 1401 const auto &perm = coo->perm; 1402 1403 PetscCall(PetscGetMemType(v, &memtype)); 1404 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1405 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1406 } else { 1407 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1408 } 1409 1410 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1411 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1412 1413 PetscCall(PetscLogGpuTimeBegin()); 1414 Kokkos::parallel_for( 1415 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1416 PetscScalar sum = 0.0; 1417 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1418 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1419 }); 1420 PetscCall(PetscLogGpuTimeEnd()); 1421 1422 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1423 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1424 PetscFunctionReturn(PETSC_SUCCESS); 1425 } 1426 1427 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1428 { 1429 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1430 1431 PetscFunctionBegin; 1432 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1433 A->boundtocpu = PETSC_FALSE; 1434 1435 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1436 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1437 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1438 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1439 A->ops->scale = MatScale_SeqAIJKokkos; 1440 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1441 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1442 A->ops->mult = MatMult_SeqAIJKokkos; 1443 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1444 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1445 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1446 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1447 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1448 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1449 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1450 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1451 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1452 A->ops->shift = MatShift_SeqAIJKokkos; 1453 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1454 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1455 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1456 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1457 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1458 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1459 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1460 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1461 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1462 1463 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1464 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1465 #if defined(PETSC_HAVE_HYPRE) 1466 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1467 #endif 1468 PetscFunctionReturn(PETSC_SUCCESS); 1469 } 1470 1471 /* 1472 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1473 1474 Input Parameters: 1475 + A - the MATSEQAIJKOKKOS matrix 1476 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1477 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1478 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1479 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1480 1481 Output Parameter: 1482 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1483 */ 1484 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1485 { 1486 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1487 PetscInt nblocks = bs.extent(0) - 1; 1488 1489 PetscFunctionBegin; 1490 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1491 1492 // Pull out the diagonal blocks of the matrix and then invert the blocks 1493 auto Aa = akok->a_dual.view_device(); 1494 auto Ai = akok->i_dual.view_device(); 1495 auto Aj = akok->j_dual.view_device(); 1496 auto Adiag = akok->diag_dual.view_device(); 1497 // TODO: how to tune the team size? 1498 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1499 auto ts = Kokkos::AUTO(); 1500 #else 1501 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1502 #endif 1503 PetscCallCXX(Kokkos::parallel_for( 1504 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1505 const PetscInt bid = teamMember.league_rank(); // block id 1506 const PetscInt rstart = bs(bid); // this block starts from this row 1507 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1508 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1509 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1510 1511 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1512 PetscInt i = rstart + r; // i-th row in A 1513 1514 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1515 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1516 1517 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1518 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1519 B(r, c) = 0.0; 1520 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1521 B(r, c) = Aa(first + c); 1522 } else { // this entry does not show up in the CSR 1523 B(r, c) = 0.0; 1524 } 1525 } 1526 } else { // rare case that the diagonal does not exist 1527 const PetscInt begin = Ai(i); 1528 const PetscInt end = Ai(i + 1); 1529 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1530 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1531 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1532 else if (Aj(j) >= rstart + m) break; 1533 } 1534 } 1535 }); 1536 1537 // LU-decompose B (w/o pivoting) and then invert B 1538 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1539 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1540 })); 1541 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1542 PetscFunctionReturn(PETSC_SUCCESS); 1543 } 1544 1545 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1546 { 1547 Mat_SeqAIJ *aseq; 1548 PetscInt i, m, n; 1549 auto exec = PetscGetKokkosExecutionSpace(); 1550 1551 PetscFunctionBegin; 1552 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1553 1554 m = akok->nrows(); 1555 n = akok->ncols(); 1556 PetscCall(MatSetSizes(A, m, n, m, n)); 1557 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1558 1559 /* Set up data structures of A as a MATSEQAIJ */ 1560 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1561 aseq = (Mat_SeqAIJ *)A->data; 1562 1563 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1564 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1565 1566 aseq->i = akok->i_host_data(); 1567 aseq->j = akok->j_host_data(); 1568 aseq->a = akok->a_host_data(); 1569 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1570 aseq->free_a = PETSC_FALSE; 1571 aseq->free_ij = PETSC_FALSE; 1572 aseq->nz = akok->nnz(); 1573 aseq->maxnz = aseq->nz; 1574 1575 PetscCall(PetscMalloc1(m, &aseq->imax)); 1576 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1577 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1578 1579 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1580 akok->nonzerostate = A->nonzerostate; 1581 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1582 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1583 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1584 PetscFunctionReturn(PETSC_SUCCESS); 1585 } 1586 1587 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1588 { 1589 PetscFunctionBegin; 1590 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1591 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1592 PetscFunctionReturn(PETSC_SUCCESS); 1593 } 1594 1595 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1596 { 1597 Mat_SeqAIJKokkos *akok; 1598 1599 PetscFunctionBegin; 1600 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1601 PetscCall(MatCreate(comm, A)); 1602 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1603 PetscFunctionReturn(PETSC_SUCCESS); 1604 } 1605 1606 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1607 1608 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1609 */ 1610 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1611 { 1612 PetscFunctionBegin; 1613 PetscCall(MatCreate(comm, A)); 1614 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1615 PetscFunctionReturn(PETSC_SUCCESS); 1616 } 1617 1618 /*@C 1619 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1620 (the default parallel PETSc format). This matrix will ultimately be handled by 1621 Kokkos for calculations. 1622 1623 Collective 1624 1625 Input Parameters: 1626 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1627 . m - number of rows 1628 . n - number of columns 1629 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1630 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1631 1632 Output Parameter: 1633 . A - the matrix 1634 1635 Level: intermediate 1636 1637 Notes: 1638 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1639 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1640 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1641 1642 The AIJ format, also called 1643 compressed row storage, is fully compatible with standard Fortran 1644 storage. That is, the stored row and column indices can begin at 1645 either one (as in Fortran) or zero. 1646 1647 Specify the preallocated storage with either `nz` or `nnz` (not both). 1648 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1649 allocation. 1650 1651 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1652 @*/ 1653 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1654 { 1655 PetscFunctionBegin; 1656 PetscCall(PetscKokkosInitializeCheck()); 1657 PetscCall(MatCreate(comm, A)); 1658 PetscCall(MatSetSizes(*A, m, n, m, n)); 1659 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1660 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1665 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1666 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1667 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1668 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1669 { 1670 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1671 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1672 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1673 1674 PetscFunctionBegin; 1675 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1676 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1677 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1678 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1679 } 1680 PetscFunctionReturn(PETSC_SUCCESS); 1681 } 1682 1683 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1684 { 1685 const PetscInt n = A->rmap->n; 1686 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1687 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1688 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1689 1690 PetscFunctionBegin; 1691 if (!factors->transpose_updated) { 1692 if (has_upper) { 1693 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1694 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1695 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1696 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1697 } 1698 1699 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1700 if (!factors->U) { 1701 Mat_SeqAIJ *seq; 1702 1703 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1704 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1705 1706 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1707 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1708 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1709 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1710 } else { 1711 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1712 } 1713 // Copy Ut from host to device 1714 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1715 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1716 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1717 } else { // If U was computed on device, we also compute the transpose there 1718 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1719 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1720 factors->jU_d, factors->aU_d, 1721 factors->iUt_d, factors->jUt_d, 1722 factors->aUt_d)); 1723 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1724 } 1725 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1726 } 1727 1728 // do the same for L with LU 1729 if (has_lower) { 1730 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1731 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1732 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1733 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1734 } 1735 1736 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1737 if (!factors->L) { 1738 Mat_SeqAIJ *seq; 1739 1740 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1741 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1742 1743 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1744 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1745 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1746 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1747 } else { 1748 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1749 } 1750 // Copy Lt from host to device 1751 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1752 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1753 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1754 } else { // If L was computed on device, we also compute the transpose there 1755 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1756 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1757 factors->jL_d, factors->aL_d, 1758 factors->iLt_d, factors->jLt_d, 1759 factors->aLt_d)); 1760 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1761 } 1762 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1763 } 1764 1765 factors->transpose_updated = PETSC_TRUE; 1766 } 1767 PetscFunctionReturn(PETSC_SUCCESS); 1768 } 1769 1770 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1771 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1772 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1773 { 1774 auto exec = PetscGetKokkosExecutionSpace(); 1775 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1776 PetscInt m = A->rmap->n; 1777 PetscScalarKokkosView D = factors->D_d; 1778 PetscScalarKokkosView X, Y, B; // alias 1779 ConstPetscScalarKokkosView b; 1780 PetscScalarKokkosView x; 1781 PetscIntKokkosView &rowperm = factors->rowperm; 1782 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1783 1784 PetscFunctionBegin; 1785 PetscCall(PetscLogGpuTimeBegin()); 1786 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1787 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1788 PetscCall(VecGetKokkosView(bb, &b)); 1789 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1790 1791 // Solve U^T Y = B 1792 if (identity) { // Reorder b with the row permutation 1793 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1794 Y = factors->workVector; 1795 } else { 1796 B = factors->workVector; 1797 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1798 Y = x; 1799 } 1800 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1801 1802 // Solve diag(D) Y' = Y. 1803 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1804 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1805 1806 // Solve UX = Y 1807 if (identity) { 1808 X = x; 1809 } else { 1810 X = factors->workVector; // B is not needed anymore 1811 } 1812 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1813 1814 // Reorder X with the inverse column (row) permutation 1815 if (!identity) { 1816 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1817 } 1818 1819 PetscCall(VecRestoreKokkosView(bb, &b)); 1820 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1821 PetscCall(PetscLogGpuTimeEnd()); 1822 PetscFunctionReturn(PETSC_SUCCESS); 1823 } 1824 1825 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1826 // R and C are represented by rowperm and colperm in factors. 1827 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1828 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1829 { 1830 auto exec = PetscGetKokkosExecutionSpace(); 1831 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1832 PetscInt m = A->rmap->n; 1833 PetscScalarKokkosView X, Y, B; // alias 1834 ConstPetscScalarKokkosView b; 1835 PetscScalarKokkosView x; 1836 PetscIntKokkosView &rowperm = factors->rowperm; 1837 PetscIntKokkosView &colperm = factors->colperm; 1838 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1839 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1840 1841 PetscFunctionBegin; 1842 PetscCall(PetscLogGpuTimeBegin()); 1843 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1844 PetscCall(VecGetKokkosView(bb, &b)); 1845 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1846 1847 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1848 if (row_identity) { 1849 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1850 Y = factors->workVector; 1851 } else { 1852 B = factors->workVector; 1853 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1854 Y = x; 1855 } 1856 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1857 1858 // Solve U C^- x = Y 1859 if (col_identity) { 1860 X = x; 1861 } else { 1862 X = factors->workVector; 1863 } 1864 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1865 1866 // x = C X; Reorder X with the inverse col permutation 1867 if (!col_identity) { 1868 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1869 } 1870 1871 PetscCall(VecRestoreKokkosView(bb, &b)); 1872 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1873 PetscCall(PetscLogGpuTimeEnd()); 1874 PetscFunctionReturn(PETSC_SUCCESS); 1875 } 1876 1877 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1878 // R and C are represented by rowperm and colperm in factors. 1879 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1880 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1881 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1882 { 1883 auto exec = PetscGetKokkosExecutionSpace(); 1884 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1885 PetscInt m = A->rmap->n; 1886 PetscScalarKokkosView X, Y, B; // alias 1887 ConstPetscScalarKokkosView b; 1888 PetscScalarKokkosView x; 1889 PetscIntKokkosView &rowperm = factors->rowperm; 1890 PetscIntKokkosView &colperm = factors->colperm; 1891 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1892 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1893 1894 PetscFunctionBegin; 1895 PetscCall(PetscLogGpuTimeBegin()); 1896 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1897 PetscCall(VecGetKokkosView(bb, &b)); 1898 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1899 1900 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1901 if (col_identity) { // Reorder b with the col permutation 1902 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1903 Y = factors->workVector; 1904 } else { 1905 B = factors->workVector; 1906 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1907 Y = x; 1908 } 1909 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1910 1911 // Solve L^T X = Y 1912 if (row_identity) { 1913 X = x; 1914 } else { 1915 X = factors->workVector; 1916 } 1917 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1918 1919 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1920 if (!row_identity) { 1921 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1922 } 1923 1924 PetscCall(VecRestoreKokkosView(bb, &b)); 1925 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1926 PetscCall(PetscLogGpuTimeEnd()); 1927 PetscFunctionReturn(PETSC_SUCCESS); 1928 } 1929 1930 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1931 { 1932 PetscFunctionBegin; 1933 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1934 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1935 1936 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1937 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1938 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1939 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1940 const MatScalar *Ba = b->a; 1941 PetscInt m = B->rmap->n, n = B->cmap->n; 1942 1943 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1944 // Allocate memory and copy the structure 1945 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1946 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1947 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1948 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1949 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1950 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1951 1952 PetscInt *Li = factors->iL_h.data(); 1953 PetscInt *Lj = factors->jL_h.data(); 1954 PetscInt *Ui = factors->iU_h.data(); 1955 PetscInt *Uj = factors->jU_h.data(); 1956 1957 Li[0] = Ui[0] = 0; 1958 for (PetscInt i = 0; i < m; i++) { 1959 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1960 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1961 1962 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1963 Lj[Li[i] + llen] = i; // diagonal entry of L 1964 1965 Uj[Ui[i]] = i; // diagonal entry of U 1966 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1967 1968 Li[i + 1] = Li[i] + llen + 1; 1969 Ui[i + 1] = Ui[i] + ulen; 1970 } 1971 1972 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1973 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1974 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1975 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1976 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1977 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1978 1979 // Copy row/col permutation to device 1980 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1981 PetscBool row_identity; 1982 PetscCall(ISIdentity(rowperm, &row_identity)); 1983 if (!row_identity) { 1984 const PetscInt *ip; 1985 1986 PetscCall(ISGetIndices(rowperm, &ip)); 1987 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1988 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1989 PetscCall(ISRestoreIndices(rowperm, &ip)); 1990 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1991 } 1992 1993 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1994 PetscBool col_identity; 1995 PetscCall(ISIdentity(colperm, &col_identity)); 1996 if (!col_identity) { 1997 const PetscInt *ip; 1998 1999 PetscCall(ISGetIndices(colperm, &ip)); 2000 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2001 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2002 PetscCall(ISRestoreIndices(colperm, &ip)); 2003 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2004 } 2005 2006 /* Create sptrsv handles for L, U and their transpose */ 2007 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2008 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2009 #else 2010 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2011 #endif 2012 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2013 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2014 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2015 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2016 } 2017 2018 // Copy the value 2019 for (PetscInt i = 0; i < m; i++) { 2020 PetscInt llen = Bi[i + 1] - Bi[i]; 2021 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2022 const PetscInt *Li = factors->iL_h.data(); 2023 const PetscInt *Ui = factors->iU_h.data(); 2024 2025 PetscScalar *La = factors->aL_h.data(); 2026 PetscScalar *Ua = factors->aU_h.data(); 2027 2028 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2029 La[Li[i] + llen] = 1.0; // diagonal entry 2030 2031 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2032 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2033 } 2034 2035 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2036 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2037 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2038 factors->transpose_updated = PETSC_FALSE; 2039 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2040 2041 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2042 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2043 } 2044 2045 B->ops->matsolve = NULL; 2046 B->ops->matsolvetranspose = NULL; 2047 PetscFunctionReturn(PETSC_SUCCESS); 2048 } 2049 2050 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2051 { 2052 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2053 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2054 PetscInt fill_lev = info->levels; 2055 2056 PetscFunctionBegin; 2057 PetscCall(PetscLogGpuTimeBegin()); 2058 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2059 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2060 2061 auto a_d = aijkok->a_dual.view_device(); 2062 auto i_d = aijkok->i_dual.view_device(); 2063 auto j_d = aijkok->j_dual.view_device(); 2064 2065 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2066 2067 B->assembled = PETSC_TRUE; 2068 B->preallocated = PETSC_TRUE; 2069 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2070 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2071 B->ops->matsolve = NULL; 2072 B->ops->matsolvetranspose = NULL; 2073 2074 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2075 factors->transpose_updated = PETSC_FALSE; 2076 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2077 /* TODO: log flops, but how to know that? */ 2078 PetscCall(PetscLogGpuTimeEnd()); 2079 PetscFunctionReturn(PETSC_SUCCESS); 2080 } 2081 2082 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2083 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2084 { 2085 Mat_SeqAIJKokkos *aijkok; 2086 Mat_SeqAIJ *b; 2087 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2088 PetscInt fill_lev = info->levels; 2089 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2090 PetscInt n = A->rmap->n; 2091 2092 PetscFunctionBegin; 2093 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2094 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2095 2096 /* Create a spiluk handle and then do symbolic factorization */ 2097 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2098 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2099 2100 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2101 2102 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2103 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2104 Kokkos::realloc(factors->iU_d, n + 1); 2105 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2106 2107 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2108 auto i_d = aijkok->i_dual.view_device(); 2109 auto j_d = aijkok->j_dual.view_device(); 2110 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2111 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2112 2113 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2114 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2115 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2116 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2117 2118 /* TODO: add options to select sptrsv algorithms */ 2119 /* Create sptrsv handles for L, U and their transpose */ 2120 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2121 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2122 #else 2123 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2124 #endif 2125 2126 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2127 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2128 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2129 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2130 2131 /* Fill fields of the factor matrix B */ 2132 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2133 b = (Mat_SeqAIJ *)B->data; 2134 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2135 B->info.fill_ratio_given = info->fill; 2136 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2137 2138 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2139 PetscFunctionReturn(PETSC_SUCCESS); 2140 } 2141 2142 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2143 { 2144 PetscFunctionBegin; 2145 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2146 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2147 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2148 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2149 PetscFunctionReturn(PETSC_SUCCESS); 2150 } 2151 2152 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2153 { 2154 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2155 2156 PetscFunctionBegin; 2157 if (!info->factoronhost) { 2158 PetscCall(ISIdentity(isrow, &row_identity)); 2159 PetscCall(ISIdentity(iscol, &col_identity)); 2160 } 2161 2162 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2163 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2164 2165 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2166 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2167 } else { 2168 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2169 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2170 } 2171 PetscFunctionReturn(PETSC_SUCCESS); 2172 } 2173 2174 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2175 { 2176 PetscFunctionBegin; 2177 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2178 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2179 2180 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2181 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2182 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2183 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2184 const MatScalar *Ba = b->a; 2185 PetscInt m = B->rmap->n; 2186 2187 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2188 // Allocate memory and copy the structure 2189 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2190 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2191 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2192 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2193 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2194 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2195 2196 // Build jU_h from the skewed Aj 2197 PetscInt *Uj = factors->jU_h.data(); 2198 for (PetscInt i = 0; i < m; i++) { 2199 PetscInt ulen = Bi[i + 1] - Bi[i]; 2200 Uj[Bi[i]] = i; // diagonal entry 2201 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2202 } 2203 2204 // Copy iU, jU to device 2205 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2206 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2207 2208 // Copy row/col permutation to device 2209 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2210 PetscBool row_identity; 2211 PetscCall(ISIdentity(rowperm, &row_identity)); 2212 if (!row_identity) { 2213 const PetscInt *ip; 2214 2215 PetscCall(ISGetIndices(rowperm, &ip)); 2216 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2217 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2218 PetscCall(ISRestoreIndices(rowperm, &ip)); 2219 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2220 } 2221 2222 // Create sptrsv handles for U and U^T 2223 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2224 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2225 #else 2226 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2227 #endif 2228 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2229 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2230 } 2231 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2232 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2233 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2234 2235 // Copy the value 2236 PetscScalar *Ua = factors->aU_h.data(); 2237 PetscScalar *D = factors->D_h.data(); 2238 for (PetscInt i = 0; i < m; i++) { 2239 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2240 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2241 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2242 } 2243 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2244 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2245 2246 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2247 factors->transpose_updated = PETSC_FALSE; 2248 } 2249 2250 B->ops->matsolve = NULL; 2251 B->ops->matsolvetranspose = NULL; 2252 PetscFunctionReturn(PETSC_SUCCESS); 2253 } 2254 2255 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2256 { 2257 PetscFunctionBegin; 2258 if (info->solveonhost) { 2259 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2260 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2261 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2262 } 2263 2264 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2265 2266 if (!info->solveonhost) { 2267 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2268 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2269 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2270 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2271 } 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2276 { 2277 PetscFunctionBegin; 2278 if (info->solveonhost) { 2279 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2280 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2281 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2282 } 2283 2284 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2285 2286 if (!info->solveonhost) { 2287 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2288 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2289 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2290 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2291 } 2292 PetscFunctionReturn(PETSC_SUCCESS); 2293 } 2294 2295 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2296 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2297 { 2298 PetscFunctionBegin; 2299 *type = MATSOLVERKOKKOS; 2300 PetscFunctionReturn(PETSC_SUCCESS); 2301 } 2302 2303 /*MC 2304 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2305 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2306 2307 Level: beginner 2308 2309 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2310 M*/ 2311 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2312 { 2313 PetscInt n = A->rmap->n; 2314 MPI_Comm comm; 2315 2316 PetscFunctionBegin; 2317 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2318 PetscCall(MatCreate(comm, B)); 2319 PetscCall(MatSetSizes(*B, n, n, n, n)); 2320 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2321 (*B)->factortype = ftype; 2322 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2323 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2324 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2325 2326 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2327 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2328 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2329 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2330 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2331 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2332 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2333 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2334 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2335 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2336 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2337 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2338 2339 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2340 (*B)->canuseordering = PETSC_TRUE; 2341 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2342 PetscFunctionReturn(PETSC_SUCCESS); 2343 } 2344 2345 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2346 { 2347 PetscFunctionBegin; 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2350 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2351 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2352 PetscFunctionReturn(PETSC_SUCCESS); 2353 } 2354 2355 /* Utility to print out a KokkosCsrMatrix for debugging */ 2356 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2357 { 2358 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2359 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2360 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2361 const PetscInt *i = iv.data(); 2362 const PetscInt *j = jv.data(); 2363 const PetscScalar *a = av.data(); 2364 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2365 2366 PetscFunctionBegin; 2367 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2368 for (PetscInt k = 0; k < m; k++) { 2369 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2370 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2371 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2372 } 2373 PetscFunctionReturn(PETSC_SUCCESS); 2374 } 2375