1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165 *array = aijkok->a_dual.view_host().data(); 166 } else { /* Happens when calling MatSetValues on a newly created matrix */ 167 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168 } 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171 172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175 176 PetscFunctionBegin; 177 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188 *array = aijkok->a_dual.view_host().data(); 189 } else { 190 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 191 } 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196 { 197 PetscFunctionBegin; 198 *array = NULL; 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205 206 PetscFunctionBegin; 207 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208 *array = aijkok->a_dual.view_host().data(); 209 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 210 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 211 } 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218 219 PetscFunctionBegin; 220 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221 aijkok->a_dual.clear_sync_state(); 222 aijkok->a_dual.modify_host(); 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228 { 229 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 230 231 PetscFunctionBegin; 232 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 233 234 if (i) *i = aijkok->i_device_data(); 235 if (j) *j = aijkok->j_device_data(); 236 if (a) { 237 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 238 *a = aijkok->a_device_data(); 239 } 240 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 /* 245 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 246 247 Input Parameter: 248 . A - the MATSEQAIJKOKKOS matrix 249 250 Output Parameters: 251 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 252 - T_d - the transpose on device, whose value array is allocated but not initialized 253 */ 254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 258 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 259 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 260 MatRowMapType *Ti = Ti_h.data(); 261 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 262 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 263 PetscInt *Tj = Tj_h.data(); 264 PetscInt *perm = perm_h.data(); 265 PetscInt *offset; 266 267 PetscFunctionBegin; 268 // Populate Ti 269 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 270 Ti++; 271 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 272 Ti--; 273 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 274 275 // Populate Tj and the permutation array 276 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 277 for (PetscInt i = 0; i < m; i++) { 278 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 279 PetscInt r = Aj[j]; // row r of T 280 PetscInt disp = Ti[r] + offset[r]; 281 282 Tj[disp] = i; // col i of T 283 perm[disp] = j; 284 offset[r]++; 285 } 286 } 287 PetscCall(PetscFree(offset)); 288 289 // Sort each row of T, along with the permutation array 290 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 291 292 // Output perm and T on device 293 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 294 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 295 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 296 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 297 PetscFunctionReturn(PETSC_SUCCESS); 298 } 299 300 // Generate the transpose on device and cache it internally 301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 303 { 304 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 305 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 306 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 307 KokkosCsrMatrix &T = akok->csrmatT; 308 309 PetscFunctionBegin; 310 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 311 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 312 313 const auto &Aa = akok->a_dual.view_device(); 314 315 if (A->symmetric == PETSC_BOOL3_TRUE) { 316 *csrmatT = akok->csrmat; 317 } else { 318 // See if we already have a cached transpose and its value is up to date 319 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 320 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 321 const auto &perm = akok->transpose_perm; // get the permutation array 322 auto &Ta = T.values; 323 324 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 325 } 326 } else { // Generate T of size n x m for the first time 327 MatRowMapKokkosView perm; 328 329 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 330 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 331 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 332 } 333 akok->transpose_updated = PETSC_TRUE; 334 *csrmatT = akok->csrmatT; 335 } 336 PetscFunctionReturn(PETSC_SUCCESS); 337 } 338 339 // Generate the Hermitian on device and cache it internally 340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 341 { 342 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 343 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 344 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 345 KokkosCsrMatrix &T = akok->csrmatH; 346 347 PetscFunctionBegin; 348 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 349 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 350 351 const auto &Aa = akok->a_dual.view_device(); 352 353 if (A->hermitian == PETSC_BOOL3_TRUE) { 354 *csrmatH = akok->csrmat; 355 } else { 356 // See if we already have a cached hermitian and its value is up to date 357 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 358 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 359 const auto &perm = akok->transpose_perm; // get the permutation array 360 auto &Ta = T.values; 361 362 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 363 } 364 } else { // Generate T of size n x m for the first time 365 MatRowMapKokkosView perm; 366 367 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 368 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 369 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 370 } 371 akok->hermitian_updated = PETSC_TRUE; 372 *csrmatH = akok->csrmatH; 373 } 374 PetscFunctionReturn(PETSC_SUCCESS); 375 } 376 377 /* y = A x */ 378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 379 { 380 Mat_SeqAIJKokkos *aijkok; 381 ConstPetscScalarKokkosView xv; 382 PetscScalarKokkosView yv; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 389 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 390 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 391 PetscCall(VecRestoreKokkosView(xx, &xv)); 392 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 393 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 394 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 395 PetscCall(PetscLogGpuTimeEnd()); 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 /* y = A^T x */ 400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 401 { 402 Mat_SeqAIJKokkos *aijkok; 403 const char *mode; 404 ConstPetscScalarKokkosView xv; 405 PetscScalarKokkosView yv; 406 KokkosCsrMatrix csrmat; 407 408 PetscFunctionBegin; 409 PetscCall(PetscLogGpuTimeBegin()); 410 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 411 PetscCall(VecGetKokkosView(xx, &xv)); 412 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 413 if (A->form_explicit_transpose) { 414 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 415 mode = "N"; 416 } else { 417 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 418 csrmat = aijkok->csrmat; 419 mode = "T"; 420 } 421 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 422 PetscCall(VecRestoreKokkosView(xx, &xv)); 423 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 424 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 425 PetscCall(PetscLogGpuTimeEnd()); 426 PetscFunctionReturn(PETSC_SUCCESS); 427 } 428 429 /* y = A^H x */ 430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 431 { 432 Mat_SeqAIJKokkos *aijkok; 433 const char *mode; 434 ConstPetscScalarKokkosView xv; 435 PetscScalarKokkosView yv; 436 KokkosCsrMatrix csrmat; 437 438 PetscFunctionBegin; 439 PetscCall(PetscLogGpuTimeBegin()); 440 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 441 PetscCall(VecGetKokkosView(xx, &xv)); 442 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 443 if (A->form_explicit_transpose) { 444 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 445 mode = "N"; 446 } else { 447 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 448 csrmat = aijkok->csrmat; 449 mode = "C"; 450 } 451 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 452 PetscCall(VecRestoreKokkosView(xx, &xv)); 453 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 454 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 455 PetscCall(PetscLogGpuTimeEnd()); 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* z = A x + y */ 460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 461 { 462 Mat_SeqAIJKokkos *aijkok; 463 ConstPetscScalarKokkosView xv; 464 PetscScalarKokkosView zv; 465 466 PetscFunctionBegin; 467 PetscCall(PetscLogGpuTimeBegin()); 468 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 469 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 470 PetscCall(VecGetKokkosView(xx, &xv)); 471 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 472 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 473 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 474 PetscCall(VecRestoreKokkosView(xx, &xv)); 475 PetscCall(VecRestoreKokkosView(zz, &zv)); 476 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 477 PetscCall(PetscLogGpuTimeEnd()); 478 PetscFunctionReturn(PETSC_SUCCESS); 479 } 480 481 /* z = A^T x + y */ 482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 483 { 484 Mat_SeqAIJKokkos *aijkok; 485 const char *mode; 486 ConstPetscScalarKokkosView xv; 487 PetscScalarKokkosView zv; 488 KokkosCsrMatrix csrmat; 489 490 PetscFunctionBegin; 491 PetscCall(PetscLogGpuTimeBegin()); 492 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 493 if (zz != yy) PetscCall(VecCopy(yy, zz)); 494 PetscCall(VecGetKokkosView(xx, &xv)); 495 PetscCall(VecGetKokkosView(zz, &zv)); 496 if (A->form_explicit_transpose) { 497 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 498 mode = "N"; 499 } else { 500 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 501 csrmat = aijkok->csrmat; 502 mode = "T"; 503 } 504 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 505 PetscCall(VecRestoreKokkosView(xx, &xv)); 506 PetscCall(VecRestoreKokkosView(zz, &zv)); 507 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 508 PetscCall(PetscLogGpuTimeEnd()); 509 PetscFunctionReturn(PETSC_SUCCESS); 510 } 511 512 /* z = A^H x + y */ 513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 514 { 515 Mat_SeqAIJKokkos *aijkok; 516 const char *mode; 517 ConstPetscScalarKokkosView xv; 518 PetscScalarKokkosView zv; 519 KokkosCsrMatrix csrmat; 520 521 PetscFunctionBegin; 522 PetscCall(PetscLogGpuTimeBegin()); 523 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 524 if (zz != yy) PetscCall(VecCopy(yy, zz)); 525 PetscCall(VecGetKokkosView(xx, &xv)); 526 PetscCall(VecGetKokkosView(zz, &zv)); 527 if (A->form_explicit_transpose) { 528 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 529 mode = "N"; 530 } else { 531 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 532 csrmat = aijkok->csrmat; 533 mode = "C"; 534 } 535 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 536 PetscCall(VecRestoreKokkosView(xx, &xv)); 537 PetscCall(VecRestoreKokkosView(zz, &zv)); 538 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 539 PetscCall(PetscLogGpuTimeEnd()); 540 PetscFunctionReturn(PETSC_SUCCESS); 541 } 542 543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 544 { 545 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 546 547 PetscFunctionBegin; 548 switch (op) { 549 case MAT_FORM_EXPLICIT_TRANSPOSE: 550 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 551 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 552 A->form_explicit_transpose = flg; 553 break; 554 default: 555 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 556 break; 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 /* Depending on reuse, either build a new mat, or use the existing mat */ 562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 563 { 564 Mat_SeqAIJ *aseq; 565 566 PetscFunctionBegin; 567 PetscCall(PetscKokkosInitializeCheck()); 568 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 569 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 570 PetscCall(MatSetType(*newmat, mtype)); 571 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 572 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 573 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 574 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 575 PetscCall(PetscFree(A->defaultvectype)); 576 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 577 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 578 PetscCall(MatSetOps_SeqAIJKokkos(A)); 579 aseq = static_cast<Mat_SeqAIJ *>(A->data); 580 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 581 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 582 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 583 } 584 } 585 PetscFunctionReturn(PETSC_SUCCESS); 586 } 587 588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 589 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 590 */ 591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 592 { 593 Mat_SeqAIJ *bseq; 594 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 595 Mat mat; 596 597 PetscFunctionBegin; 598 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 599 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 600 mat = *B; 601 if (A->assembled) { 602 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 603 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 604 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 605 /* Now copy values to B if needed */ 606 if (dupOption == MAT_COPY_VALUES) { 607 if (akok->a_dual.need_sync_device()) { 608 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 609 bkok->a_dual.modify_host(); 610 } else { /* If device has the latest data, we only copy data on device */ 611 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 612 bkok->a_dual.modify_device(); 613 } 614 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 615 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 616 bkok->a_dual.modify_host(); 617 } 618 mat->spptr = bkok; 619 } 620 621 PetscCall(PetscFree(mat->defaultvectype)); 622 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 623 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 624 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 625 PetscFunctionReturn(PETSC_SUCCESS); 626 } 627 628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 629 { 630 Mat At; 631 KokkosCsrMatrix internT; 632 Mat_SeqAIJKokkos *atkok, *bkok; 633 634 PetscFunctionBegin; 635 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 636 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 637 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 638 /* Deep copy internT, as we want to isolate the internal transpose */ 639 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 640 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 641 if (reuse == MAT_INITIAL_MATRIX) *B = At; 642 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 643 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 644 if ((*B)->assembled) { 645 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 646 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 647 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 648 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 649 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 650 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 651 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 652 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 653 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 654 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 655 } 656 PetscFunctionReturn(PETSC_SUCCESS); 657 } 658 659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 660 { 661 Mat_SeqAIJKokkos *aijkok; 662 663 PetscFunctionBegin; 664 if (A->factortype == MAT_FACTOR_NONE) { 665 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 666 delete aijkok; 667 } else { 668 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 669 } 670 A->spptr = NULL; 671 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 672 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 673 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 674 #if defined(PETSC_HAVE_HYPRE) 675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 676 #endif 677 PetscCall(MatDestroy_SeqAIJ(A)); 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 /*MC 682 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 683 684 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 685 686 Options Database Key: 687 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 688 689 Level: beginner 690 691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 692 M*/ 693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 694 { 695 PetscFunctionBegin; 696 PetscCall(PetscKokkosInitializeCheck()); 697 PetscCall(MatCreate_SeqAIJ(A)); 698 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 699 PetscFunctionReturn(PETSC_SUCCESS); 700 } 701 702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 704 { 705 Mat_SeqAIJ *a, *b; 706 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 707 MatScalarKokkosView aa, ba, ca; 708 MatRowMapKokkosView ai, bi, ci; 709 MatColIdxKokkosView aj, bj, cj; 710 PetscInt m, n, nnz, aN; 711 712 PetscFunctionBegin; 713 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 714 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 715 PetscAssertPointer(C, 4); 716 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 717 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 718 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 719 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 720 721 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 722 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 723 a = static_cast<Mat_SeqAIJ *>(A->data); 724 b = static_cast<Mat_SeqAIJ *>(B->data); 725 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 726 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 727 aa = akok->a_dual.view_device(); 728 ai = akok->i_dual.view_device(); 729 ba = bkok->a_dual.view_device(); 730 bi = bkok->i_dual.view_device(); 731 m = A->rmap->n; /* M, N and nnz of C */ 732 n = A->cmap->n + B->cmap->n; 733 nnz = a->nz + b->nz; 734 aN = A->cmap->n; /* N of A */ 735 if (reuse == MAT_INITIAL_MATRIX) { 736 aj = akok->j_dual.view_device(); 737 bj = bkok->j_dual.view_device(); 738 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 739 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 740 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 741 742 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 743 Kokkos::parallel_for( 744 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 745 PetscInt i = t.league_rank(); /* row i */ 746 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 747 748 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 749 ci(i) = coffset; 750 if (i == m - 1) ci(m) = ai(m) + bi(m); 751 }); 752 753 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 754 if (k < alen) { 755 ca(coffset + k) = aa(ai(i) + k); 756 cj(coffset + k) = aj(ai(i) + k); 757 } else { 758 ca(coffset + k) = ba(bi(i) + k - alen); 759 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 760 } 761 }); 762 }); 763 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 764 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 765 } else if (reuse == MAT_REUSE_MATRIX) { 766 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 767 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 768 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 769 ca = ckok->a_dual.view_device(); 770 ci = ckok->i_dual.view_device(); 771 772 Kokkos::parallel_for( 773 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 774 PetscInt i = t.league_rank(); /* row i */ 775 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 776 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 777 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 778 else ca(ci(i) + k) = ba(bi(i) + k - alen); 779 }); 780 }); 781 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 782 } 783 PetscFunctionReturn(PETSC_SUCCESS); 784 } 785 786 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 787 { 788 PetscFunctionBegin; 789 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 794 { 795 Mat_Product *product = C->product; 796 Mat A, B; 797 bool transA, transB; /* use bool, since KK needs this type */ 798 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 799 Mat_SeqAIJ *c; 800 MatProductData_SeqAIJKokkos *pdata; 801 KokkosCsrMatrix csrmatA, csrmatB; 802 803 PetscFunctionBegin; 804 MatCheckProduct(C, 1); 805 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 806 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 807 808 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 809 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 810 // we still do numeric. 811 if (pdata->reusesym) { // numeric reuses results from symbolic 812 pdata->reusesym = PETSC_FALSE; 813 PetscFunctionReturn(PETSC_SUCCESS); 814 } 815 816 switch (product->type) { 817 case MATPRODUCT_AB: 818 transA = false; 819 transB = false; 820 break; 821 case MATPRODUCT_AtB: 822 transA = true; 823 transB = false; 824 break; 825 case MATPRODUCT_ABt: 826 transA = false; 827 transB = true; 828 break; 829 default: 830 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 831 } 832 833 A = product->A; 834 B = product->B; 835 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 836 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 837 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 838 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 839 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 840 841 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 842 843 csrmatA = akok->csrmat; 844 csrmatB = bkok->csrmat; 845 846 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 847 if (transA) { 848 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 849 transA = false; 850 } 851 852 if (transB) { 853 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 854 transB = false; 855 } 856 PetscCall(PetscLogGpuTimeBegin()); 857 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 858 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 859 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 860 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 861 #endif 862 863 PetscCall(PetscLogGpuTimeEnd()); 864 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 865 /* shorter version of MatAssemblyEnd_SeqAIJ */ 866 c = (Mat_SeqAIJ *)C->data; 867 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 868 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 869 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 870 c->reallocs = 0; 871 C->info.mallocs = 0; 872 C->info.nz_unneeded = 0; 873 C->assembled = C->was_assembled = PETSC_TRUE; 874 C->num_ass++; 875 PetscFunctionReturn(PETSC_SUCCESS); 876 } 877 878 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 879 { 880 Mat_Product *product = C->product; 881 MatProductType ptype; 882 Mat A, B; 883 bool transA, transB; 884 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 885 MatProductData_SeqAIJKokkos *pdata; 886 MPI_Comm comm; 887 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 888 889 PetscFunctionBegin; 890 MatCheckProduct(C, 1); 891 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 892 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 893 A = product->A; 894 B = product->B; 895 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 896 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 897 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 898 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 899 csrmatA = akok->csrmat; 900 csrmatB = bkok->csrmat; 901 902 ptype = product->type; 903 // Take advantage of the symmetry if true 904 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 905 ptype = MATPRODUCT_AB; 906 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 907 } 908 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 909 ptype = MATPRODUCT_AB; 910 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 911 } 912 913 switch (ptype) { 914 case MATPRODUCT_AB: 915 transA = false; 916 transB = false; 917 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 918 break; 919 case MATPRODUCT_AtB: 920 transA = true; 921 transB = false; 922 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 923 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 924 break; 925 case MATPRODUCT_ABt: 926 transA = false; 927 transB = true; 928 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 929 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 930 break; 931 default: 932 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 933 } 934 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 935 pdata->reusesym = product->api_user; 936 937 /* TODO: add command line options to select spgemm algorithms */ 938 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 939 940 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 941 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 942 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 943 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 944 #endif 945 #endif 946 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 947 948 PetscCall(PetscLogGpuTimeBegin()); 949 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 950 if (transA) { 951 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 952 transA = false; 953 } 954 955 if (transB) { 956 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 957 transB = false; 958 } 959 960 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 961 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 962 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 963 calling new Mat_SeqAIJKokkos(). 964 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 965 */ 966 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 967 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 968 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 969 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 970 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 971 #endif 972 PetscCall(PetscLogGpuTimeEnd()); 973 974 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 975 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 976 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 /* handles sparse matrix matrix ops */ 981 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 982 { 983 Mat_Product *product = mat->product; 984 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 985 986 PetscFunctionBegin; 987 MatCheckProduct(mat, 1); 988 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 989 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 990 if (Biskok && Ciskok) { 991 switch (product->type) { 992 case MATPRODUCT_AB: 993 case MATPRODUCT_AtB: 994 case MATPRODUCT_ABt: 995 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 996 break; 997 case MATPRODUCT_PtAP: 998 case MATPRODUCT_RARt: 999 case MATPRODUCT_ABC: 1000 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1001 break; 1002 default: 1003 break; 1004 } 1005 } else { /* fallback for AIJ */ 1006 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1007 } 1008 PetscFunctionReturn(PETSC_SUCCESS); 1009 } 1010 1011 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1012 { 1013 Mat_SeqAIJKokkos *aijkok; 1014 1015 PetscFunctionBegin; 1016 PetscCall(PetscLogGpuTimeBegin()); 1017 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1018 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1019 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1020 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1021 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1022 PetscCall(PetscLogGpuTimeEnd()); 1023 PetscFunctionReturn(PETSC_SUCCESS); 1024 } 1025 1026 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1027 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1028 { 1029 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1030 1031 PetscFunctionBegin; 1032 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1033 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1034 1035 PetscCall(PetscLogGpuTimeBegin()); 1036 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1037 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1038 const auto &Aa = aijkok->a_dual.view_device(); 1039 const auto &Adiag = aijkok->diag_dual.view_device(); 1040 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1041 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1042 PetscCall(PetscLogGpuFlops(n)); 1043 PetscCall(PetscLogGpuTimeEnd()); 1044 } else { // need reassembly, very slow! 1045 PetscCall(MatShift_Basic(A, a)); 1046 } 1047 PetscFunctionReturn(PETSC_SUCCESS); 1048 } 1049 1050 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1051 { 1052 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1053 1054 PetscFunctionBegin; 1055 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1056 ConstPetscScalarKokkosView dv; 1057 PetscInt n, nv; 1058 1059 PetscCall(PetscLogGpuTimeBegin()); 1060 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1061 PetscCall(VecGetKokkosView(D, &dv)); 1062 PetscCall(VecGetLocalSize(D, &nv)); 1063 n = PetscMin(Y->rmap->n, Y->cmap->n); 1064 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1065 1066 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1067 const auto &Aa = aijkok->a_dual.view_device(); 1068 const auto &Adiag = aijkok->diag_dual.view_device(); 1069 PetscCallCXX(Kokkos::parallel_for( 1070 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1071 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1072 else Aa(Adiag(i)) += dv(i); 1073 })); 1074 PetscCall(VecRestoreKokkosView(D, &dv)); 1075 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1076 PetscCall(PetscLogGpuFlops(n)); 1077 PetscCall(PetscLogGpuTimeEnd()); 1078 } else { // need reassembly, very slow! 1079 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1080 } 1081 PetscFunctionReturn(PETSC_SUCCESS); 1082 } 1083 1084 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1085 { 1086 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1087 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1088 ConstPetscScalarKokkosView lv, rv; 1089 1090 PetscFunctionBegin; 1091 PetscCall(PetscLogGpuTimeBegin()); 1092 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1093 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1094 const auto &Aa = aijkok->a_dual.view_device(); 1095 const auto &Ai = aijkok->i_dual.view_device(); 1096 const auto &Aj = aijkok->j_dual.view_device(); 1097 if (ll) { 1098 PetscCall(VecGetLocalSize(ll, &m)); 1099 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1100 PetscCall(VecGetKokkosView(ll, &lv)); 1101 PetscCallCXX(Kokkos::parallel_for( // for each row 1102 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1103 PetscInt i = t.league_rank(); // row i 1104 PetscInt len = Ai(i + 1) - Ai(i); 1105 // scale entries on the row 1106 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1107 })); 1108 PetscCall(VecRestoreKokkosView(ll, &lv)); 1109 PetscCall(PetscLogGpuFlops(nz)); 1110 } 1111 if (rr) { 1112 PetscCall(VecGetLocalSize(rr, &n)); 1113 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1114 PetscCall(VecGetKokkosView(rr, &rv)); 1115 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1116 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1117 PetscCall(VecRestoreKokkosView(rr, &lv)); 1118 PetscCall(PetscLogGpuFlops(nz)); 1119 } 1120 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1121 PetscCall(PetscLogGpuTimeEnd()); 1122 PetscFunctionReturn(PETSC_SUCCESS); 1123 } 1124 1125 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1126 { 1127 Mat_SeqAIJKokkos *aijkok; 1128 1129 PetscFunctionBegin; 1130 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1131 if (aijkok) { /* Only zero the device if data is already there */ 1132 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1133 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1134 } else { /* Might be preallocated but not assembled */ 1135 PetscCall(MatZeroEntries_SeqAIJ(A)); 1136 } 1137 PetscFunctionReturn(PETSC_SUCCESS); 1138 } 1139 1140 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1141 { 1142 Mat_SeqAIJKokkos *aijkok; 1143 PetscInt n; 1144 PetscScalarKokkosView xv; 1145 1146 PetscFunctionBegin; 1147 PetscCall(VecGetLocalSize(x, &n)); 1148 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1149 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1150 1151 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1152 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1153 1154 const auto &Aa = aijkok->a_dual.view_device(); 1155 const auto &Ai = aijkok->i_dual.view_device(); 1156 const auto &Adiag = aijkok->diag_dual.view_device(); 1157 1158 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1159 Kokkos::parallel_for( 1160 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1161 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1162 else xv(i) = 0; 1163 }); 1164 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1165 PetscFunctionReturn(PETSC_SUCCESS); 1166 } 1167 1168 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1169 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1170 { 1171 Mat_SeqAIJKokkos *aijkok; 1172 1173 PetscFunctionBegin; 1174 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1175 PetscAssertPointer(kv, 2); 1176 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1177 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1178 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1179 *kv = aijkok->a_dual.view_device(); 1180 PetscFunctionReturn(PETSC_SUCCESS); 1181 } 1182 1183 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1184 { 1185 PetscFunctionBegin; 1186 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1187 PetscAssertPointer(kv, 2); 1188 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1193 { 1194 Mat_SeqAIJKokkos *aijkok; 1195 1196 PetscFunctionBegin; 1197 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1198 PetscAssertPointer(kv, 2); 1199 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1200 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1201 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1202 *kv = aijkok->a_dual.view_device(); 1203 PetscFunctionReturn(PETSC_SUCCESS); 1204 } 1205 1206 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1207 { 1208 PetscFunctionBegin; 1209 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1210 PetscAssertPointer(kv, 2); 1211 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1212 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1217 { 1218 Mat_SeqAIJKokkos *aijkok; 1219 1220 PetscFunctionBegin; 1221 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1222 PetscAssertPointer(kv, 2); 1223 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1224 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1225 *kv = aijkok->a_dual.view_device(); 1226 PetscFunctionReturn(PETSC_SUCCESS); 1227 } 1228 1229 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1230 { 1231 PetscFunctionBegin; 1232 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1233 PetscAssertPointer(kv, 2); 1234 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1235 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1236 PetscFunctionReturn(PETSC_SUCCESS); 1237 } 1238 1239 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1240 { 1241 Mat_SeqAIJKokkos *akok; 1242 1243 PetscFunctionBegin; 1244 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1245 PetscCall(MatCreate(comm, A)); 1246 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1247 PetscFunctionReturn(PETSC_SUCCESS); 1248 } 1249 1250 /* Computes Y += alpha X */ 1251 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1252 { 1253 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1254 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1255 ConstMatScalarKokkosView Xa; 1256 MatScalarKokkosView Ya; 1257 auto exec = PetscGetKokkosExecutionSpace(); 1258 1259 PetscFunctionBegin; 1260 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1261 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1262 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1263 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1264 PetscCall(PetscLogGpuTimeBegin()); 1265 1266 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1267 PetscBool e; 1268 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1269 if (e) { 1270 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1271 if (e) pattern = SAME_NONZERO_PATTERN; 1272 } 1273 } 1274 1275 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1276 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1277 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1278 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1279 */ 1280 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1281 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1282 Xa = xkok->a_dual.view_device(); 1283 Ya = ykok->a_dual.view_device(); 1284 1285 if (pattern == SAME_NONZERO_PATTERN) { 1286 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1287 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1288 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1289 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1290 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1291 1292 Kokkos::parallel_for( 1293 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1294 PetscInt i = t.league_rank(); // row i 1295 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1296 // Only one thread works in a team 1297 PetscInt p, q = Yi(i); 1298 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1299 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1300 if (Xj(p) == Yj(q)) { // Found it 1301 Ya(q) += alpha * Xa(p); 1302 q++; 1303 } else { 1304 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1305 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1306 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1307 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1308 #else 1309 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1310 #endif 1311 } 1312 } 1313 }); 1314 }); 1315 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1316 } else { // different nonzero patterns 1317 Mat Z; 1318 KokkosCsrMatrix zcsr; 1319 KernelHandle kh; 1320 kh.create_spadd_handle(true); // X, Y are sorted 1321 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1322 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1323 zkok = new Mat_SeqAIJKokkos(zcsr); 1324 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1325 PetscCall(MatHeaderReplace(Y, &Z)); 1326 kh.destroy_spadd_handle(); 1327 } 1328 PetscCall(PetscLogGpuTimeEnd()); 1329 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1330 PetscFunctionReturn(PETSC_SUCCESS); 1331 } 1332 1333 struct MatCOOStruct_SeqAIJKokkos { 1334 PetscCount n; 1335 PetscCount Atot; 1336 PetscInt nz; 1337 PetscCountKokkosView jmap; 1338 PetscCountKokkosView perm; 1339 1340 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1341 { 1342 nz = coo_h->nz; 1343 n = coo_h->n; 1344 Atot = coo_h->Atot; 1345 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1346 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1347 } 1348 }; 1349 1350 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1351 { 1352 PetscFunctionBegin; 1353 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1354 PetscFunctionReturn(PETSC_SUCCESS); 1355 } 1356 1357 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1358 { 1359 Mat_SeqAIJKokkos *akok; 1360 Mat_SeqAIJ *aseq; 1361 PetscContainer container_h; 1362 MatCOOStruct_SeqAIJ *coo_h; 1363 MatCOOStruct_SeqAIJKokkos *coo_d; 1364 1365 PetscFunctionBegin; 1366 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1367 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1368 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1369 delete akok; 1370 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1371 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1372 1373 // Copy the COO struct to device 1374 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1375 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1376 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1377 1378 // Put the COO struct in a container and then attach that to the matrix 1379 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1384 { 1385 MatScalarKokkosView Aa; 1386 ConstMatScalarKokkosView kv; 1387 PetscMemType memtype; 1388 PetscContainer container; 1389 MatCOOStruct_SeqAIJKokkos *coo; 1390 1391 PetscFunctionBegin; 1392 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1393 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1394 1395 const auto &n = coo->n; 1396 const auto &Annz = coo->nz; 1397 const auto &jmap = coo->jmap; 1398 const auto &perm = coo->perm; 1399 1400 PetscCall(PetscGetMemType(v, &memtype)); 1401 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1402 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1403 } else { 1404 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1405 } 1406 1407 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1408 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1409 1410 PetscCall(PetscLogGpuTimeBegin()); 1411 Kokkos::parallel_for( 1412 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1413 PetscScalar sum = 0.0; 1414 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1415 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1416 }); 1417 PetscCall(PetscLogGpuTimeEnd()); 1418 1419 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1420 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1421 PetscFunctionReturn(PETSC_SUCCESS); 1422 } 1423 1424 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1425 { 1426 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1427 1428 PetscFunctionBegin; 1429 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1430 A->boundtocpu = PETSC_FALSE; 1431 1432 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1433 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1434 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1435 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1436 A->ops->scale = MatScale_SeqAIJKokkos; 1437 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1438 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1439 A->ops->mult = MatMult_SeqAIJKokkos; 1440 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1441 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1442 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1443 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1444 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1445 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1446 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1447 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1448 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1449 A->ops->shift = MatShift_SeqAIJKokkos; 1450 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1451 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1452 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1453 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1454 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1455 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1456 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1457 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1458 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1459 1460 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1461 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1462 #if defined(PETSC_HAVE_HYPRE) 1463 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1464 #endif 1465 PetscFunctionReturn(PETSC_SUCCESS); 1466 } 1467 1468 /* 1469 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1470 1471 Input Parameters: 1472 + A - the MATSEQAIJKOKKOS matrix 1473 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1474 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1475 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1476 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1477 1478 Output Parameter: 1479 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1480 */ 1481 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1482 { 1483 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1484 PetscInt nblocks = bs.extent(0) - 1; 1485 1486 PetscFunctionBegin; 1487 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1488 1489 // Pull out the diagonal blocks of the matrix and then invert the blocks 1490 auto Aa = akok->a_dual.view_device(); 1491 auto Ai = akok->i_dual.view_device(); 1492 auto Aj = akok->j_dual.view_device(); 1493 auto Adiag = akok->diag_dual.view_device(); 1494 // TODO: how to tune the team size? 1495 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1496 auto ts = Kokkos::AUTO(); 1497 #else 1498 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1499 #endif 1500 PetscCallCXX(Kokkos::parallel_for( 1501 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1502 const PetscInt bid = teamMember.league_rank(); // block id 1503 const PetscInt rstart = bs(bid); // this block starts from this row 1504 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1505 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1506 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1507 1508 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1509 PetscInt i = rstart + r; // i-th row in A 1510 1511 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1512 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1513 1514 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1515 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1516 B(r, c) = 0.0; 1517 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1518 B(r, c) = Aa(first + c); 1519 } else { // this entry does not show up in the CSR 1520 B(r, c) = 0.0; 1521 } 1522 } 1523 } else { // rare case that the diagonal does not exist 1524 const PetscInt begin = Ai(i); 1525 const PetscInt end = Ai(i + 1); 1526 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1527 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1528 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1529 else if (Aj(j) >= rstart + m) break; 1530 } 1531 } 1532 }); 1533 1534 // LU-decompose B (w/o pivoting) and then invert B 1535 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1536 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1537 })); 1538 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1539 PetscFunctionReturn(PETSC_SUCCESS); 1540 } 1541 1542 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1543 { 1544 Mat_SeqAIJ *aseq; 1545 PetscInt i, m, n; 1546 auto exec = PetscGetKokkosExecutionSpace(); 1547 1548 PetscFunctionBegin; 1549 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1550 1551 m = akok->nrows(); 1552 n = akok->ncols(); 1553 PetscCall(MatSetSizes(A, m, n, m, n)); 1554 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1555 1556 /* Set up data structures of A as a MATSEQAIJ */ 1557 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1558 aseq = (Mat_SeqAIJ *)A->data; 1559 1560 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1561 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1562 1563 aseq->i = akok->i_host_data(); 1564 aseq->j = akok->j_host_data(); 1565 aseq->a = akok->a_host_data(); 1566 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1567 aseq->free_a = PETSC_FALSE; 1568 aseq->free_ij = PETSC_FALSE; 1569 aseq->nz = akok->nnz(); 1570 aseq->maxnz = aseq->nz; 1571 1572 PetscCall(PetscMalloc1(m, &aseq->imax)); 1573 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1574 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1575 1576 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1577 akok->nonzerostate = A->nonzerostate; 1578 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1579 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1580 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1585 { 1586 PetscFunctionBegin; 1587 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1588 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1589 PetscFunctionReturn(PETSC_SUCCESS); 1590 } 1591 1592 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1593 { 1594 Mat_SeqAIJKokkos *akok; 1595 1596 PetscFunctionBegin; 1597 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1598 PetscCall(MatCreate(comm, A)); 1599 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1600 PetscFunctionReturn(PETSC_SUCCESS); 1601 } 1602 1603 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1604 1605 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1606 */ 1607 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1608 { 1609 PetscFunctionBegin; 1610 PetscCall(MatCreate(comm, A)); 1611 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1612 PetscFunctionReturn(PETSC_SUCCESS); 1613 } 1614 1615 /*@C 1616 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1617 (the default parallel PETSc format). This matrix will ultimately be handled by 1618 Kokkos for calculations. 1619 1620 Collective 1621 1622 Input Parameters: 1623 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1624 . m - number of rows 1625 . n - number of columns 1626 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1627 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1628 1629 Output Parameter: 1630 . A - the matrix 1631 1632 Level: intermediate 1633 1634 Notes: 1635 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1636 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1637 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1638 1639 The AIJ format, also called 1640 compressed row storage, is fully compatible with standard Fortran 1641 storage. That is, the stored row and column indices can begin at 1642 either one (as in Fortran) or zero. 1643 1644 Specify the preallocated storage with either `nz` or `nnz` (not both). 1645 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1646 allocation. 1647 1648 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1649 @*/ 1650 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1651 { 1652 PetscFunctionBegin; 1653 PetscCall(PetscKokkosInitializeCheck()); 1654 PetscCall(MatCreate(comm, A)); 1655 PetscCall(MatSetSizes(*A, m, n, m, n)); 1656 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1657 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1658 PetscFunctionReturn(PETSC_SUCCESS); 1659 } 1660 1661 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1662 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1663 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1664 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1665 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1666 { 1667 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1668 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1669 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1670 1671 PetscFunctionBegin; 1672 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1673 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1674 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1675 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1676 } 1677 PetscFunctionReturn(PETSC_SUCCESS); 1678 } 1679 1680 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1681 { 1682 const PetscInt n = A->rmap->n; 1683 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1684 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1685 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1686 1687 PetscFunctionBegin; 1688 if (!factors->transpose_updated) { 1689 if (has_upper) { 1690 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1691 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1692 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1693 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1694 } 1695 1696 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1697 if (!factors->U) { 1698 Mat_SeqAIJ *seq; 1699 1700 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1701 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1702 1703 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1704 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1705 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1706 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1707 } else { 1708 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1709 } 1710 // Copy Ut from host to device 1711 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1712 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1713 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1714 } else { // If U was computed on device, we also compute the transpose there 1715 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1716 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1717 factors->jU_d, factors->aU_d, 1718 factors->iUt_d, factors->jUt_d, 1719 factors->aUt_d)); 1720 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1721 } 1722 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1723 } 1724 1725 // do the same for L with LU 1726 if (has_lower) { 1727 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1728 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1729 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1730 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1731 } 1732 1733 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1734 if (!factors->L) { 1735 Mat_SeqAIJ *seq; 1736 1737 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1738 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1739 1740 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1741 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1742 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1743 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1744 } else { 1745 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1746 } 1747 // Copy Lt from host to device 1748 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1749 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1750 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1751 } else { // If L was computed on device, we also compute the transpose there 1752 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1753 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1754 factors->jL_d, factors->aL_d, 1755 factors->iLt_d, factors->jLt_d, 1756 factors->aLt_d)); 1757 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1758 } 1759 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1760 } 1761 1762 factors->transpose_updated = PETSC_TRUE; 1763 } 1764 PetscFunctionReturn(PETSC_SUCCESS); 1765 } 1766 1767 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1768 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1769 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1770 { 1771 auto exec = PetscGetKokkosExecutionSpace(); 1772 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1773 PetscInt m = A->rmap->n; 1774 PetscScalarKokkosView D = factors->D_d; 1775 PetscScalarKokkosView X, Y, B; // alias 1776 ConstPetscScalarKokkosView b; 1777 PetscScalarKokkosView x; 1778 PetscIntKokkosView &rowperm = factors->rowperm; 1779 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1780 1781 PetscFunctionBegin; 1782 PetscCall(PetscLogGpuTimeBegin()); 1783 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1784 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1785 PetscCall(VecGetKokkosView(bb, &b)); 1786 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1787 1788 // Solve U^T Y = B 1789 if (identity) { // Reorder b with the row permutation 1790 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1791 Y = factors->workVector; 1792 } else { 1793 B = factors->workVector; 1794 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1795 Y = x; 1796 } 1797 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1798 1799 // Solve diag(D) Y' = Y. 1800 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1801 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1802 1803 // Solve UX = Y 1804 if (identity) { 1805 X = x; 1806 } else { 1807 X = factors->workVector; // B is not needed anymore 1808 } 1809 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1810 1811 // Reorder X with the inverse column (row) permutation 1812 if (!identity) { 1813 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1814 } 1815 1816 PetscCall(VecRestoreKokkosView(bb, &b)); 1817 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1818 PetscCall(PetscLogGpuTimeEnd()); 1819 PetscFunctionReturn(PETSC_SUCCESS); 1820 } 1821 1822 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1823 // R and C are represented by rowperm and colperm in factors. 1824 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1825 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1826 { 1827 auto exec = PetscGetKokkosExecutionSpace(); 1828 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1829 PetscInt m = A->rmap->n; 1830 PetscScalarKokkosView X, Y, B; // alias 1831 ConstPetscScalarKokkosView b; 1832 PetscScalarKokkosView x; 1833 PetscIntKokkosView &rowperm = factors->rowperm; 1834 PetscIntKokkosView &colperm = factors->colperm; 1835 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1836 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1837 1838 PetscFunctionBegin; 1839 PetscCall(PetscLogGpuTimeBegin()); 1840 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1841 PetscCall(VecGetKokkosView(bb, &b)); 1842 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1843 1844 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1845 if (row_identity) { 1846 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1847 Y = factors->workVector; 1848 } else { 1849 B = factors->workVector; 1850 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1851 Y = x; 1852 } 1853 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1854 1855 // Solve U C^- x = Y 1856 if (col_identity) { 1857 X = x; 1858 } else { 1859 X = factors->workVector; 1860 } 1861 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1862 1863 // x = C X; Reorder X with the inverse col permutation 1864 if (!col_identity) { 1865 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1866 } 1867 1868 PetscCall(VecRestoreKokkosView(bb, &b)); 1869 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1870 PetscCall(PetscLogGpuTimeEnd()); 1871 PetscFunctionReturn(PETSC_SUCCESS); 1872 } 1873 1874 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1875 // R and C are represented by rowperm and colperm in factors. 1876 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1877 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1878 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1879 { 1880 auto exec = PetscGetKokkosExecutionSpace(); 1881 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1882 PetscInt m = A->rmap->n; 1883 PetscScalarKokkosView X, Y, B; // alias 1884 ConstPetscScalarKokkosView b; 1885 PetscScalarKokkosView x; 1886 PetscIntKokkosView &rowperm = factors->rowperm; 1887 PetscIntKokkosView &colperm = factors->colperm; 1888 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1889 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1890 1891 PetscFunctionBegin; 1892 PetscCall(PetscLogGpuTimeBegin()); 1893 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1894 PetscCall(VecGetKokkosView(bb, &b)); 1895 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1896 1897 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1898 if (col_identity) { // Reorder b with the col permutation 1899 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1900 Y = factors->workVector; 1901 } else { 1902 B = factors->workVector; 1903 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1904 Y = x; 1905 } 1906 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1907 1908 // Solve L^T X = Y 1909 if (row_identity) { 1910 X = x; 1911 } else { 1912 X = factors->workVector; 1913 } 1914 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1915 1916 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1917 if (!row_identity) { 1918 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1919 } 1920 1921 PetscCall(VecRestoreKokkosView(bb, &b)); 1922 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1923 PetscCall(PetscLogGpuTimeEnd()); 1924 PetscFunctionReturn(PETSC_SUCCESS); 1925 } 1926 1927 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1928 { 1929 PetscFunctionBegin; 1930 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1931 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1932 1933 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1934 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1935 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1936 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1937 const MatScalar *Ba = b->a; 1938 PetscInt m = B->rmap->n, n = B->cmap->n; 1939 1940 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1941 // Allocate memory and copy the structure 1942 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1943 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1944 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1945 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1946 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1947 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1948 1949 PetscInt *Li = factors->iL_h.data(); 1950 PetscInt *Lj = factors->jL_h.data(); 1951 PetscInt *Ui = factors->iU_h.data(); 1952 PetscInt *Uj = factors->jU_h.data(); 1953 1954 Li[0] = Ui[0] = 0; 1955 for (PetscInt i = 0; i < m; i++) { 1956 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1957 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1958 1959 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1960 Lj[Li[i] + llen] = i; // diagonal entry of L 1961 1962 Uj[Ui[i]] = i; // diagonal entry of U 1963 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1964 1965 Li[i + 1] = Li[i] + llen + 1; 1966 Ui[i + 1] = Ui[i] + ulen; 1967 } 1968 1969 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1970 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1971 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1972 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1973 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1974 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1975 1976 // Copy row/col permutation to device 1977 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1978 PetscBool row_identity; 1979 PetscCall(ISIdentity(rowperm, &row_identity)); 1980 if (!row_identity) { 1981 const PetscInt *ip; 1982 1983 PetscCall(ISGetIndices(rowperm, &ip)); 1984 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1985 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1986 PetscCall(ISRestoreIndices(rowperm, &ip)); 1987 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1988 } 1989 1990 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1991 PetscBool col_identity; 1992 PetscCall(ISIdentity(colperm, &col_identity)); 1993 if (!col_identity) { 1994 const PetscInt *ip; 1995 1996 PetscCall(ISGetIndices(colperm, &ip)); 1997 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 1998 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 1999 PetscCall(ISRestoreIndices(colperm, &ip)); 2000 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2001 } 2002 2003 /* Create sptrsv handles for L, U and their transpose */ 2004 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2005 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2006 #else 2007 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2008 #endif 2009 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2010 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2011 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2012 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2013 } 2014 2015 // Copy the value 2016 for (PetscInt i = 0; i < m; i++) { 2017 PetscInt llen = Bi[i + 1] - Bi[i]; 2018 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2019 const PetscInt *Li = factors->iL_h.data(); 2020 const PetscInt *Ui = factors->iU_h.data(); 2021 2022 PetscScalar *La = factors->aL_h.data(); 2023 PetscScalar *Ua = factors->aU_h.data(); 2024 2025 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2026 La[Li[i] + llen] = 1.0; // diagonal entry 2027 2028 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2029 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2030 } 2031 2032 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2033 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2034 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2035 factors->transpose_updated = PETSC_FALSE; 2036 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2037 2038 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2039 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2040 } 2041 2042 B->ops->matsolve = NULL; 2043 B->ops->matsolvetranspose = NULL; 2044 PetscFunctionReturn(PETSC_SUCCESS); 2045 } 2046 2047 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2048 { 2049 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2050 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2051 PetscInt fill_lev = info->levels; 2052 2053 PetscFunctionBegin; 2054 PetscCall(PetscLogGpuTimeBegin()); 2055 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2056 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2057 2058 auto a_d = aijkok->a_dual.view_device(); 2059 auto i_d = aijkok->i_dual.view_device(); 2060 auto j_d = aijkok->j_dual.view_device(); 2061 2062 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2063 2064 B->assembled = PETSC_TRUE; 2065 B->preallocated = PETSC_TRUE; 2066 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2067 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2068 B->ops->matsolve = NULL; 2069 B->ops->matsolvetranspose = NULL; 2070 2071 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2072 factors->transpose_updated = PETSC_FALSE; 2073 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2074 /* TODO: log flops, but how to know that? */ 2075 PetscCall(PetscLogGpuTimeEnd()); 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2080 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2081 { 2082 Mat_SeqAIJKokkos *aijkok; 2083 Mat_SeqAIJ *b; 2084 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2085 PetscInt fill_lev = info->levels; 2086 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2087 PetscInt n = A->rmap->n; 2088 2089 PetscFunctionBegin; 2090 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2091 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2092 2093 /* Create a spiluk handle and then do symbolic factorization */ 2094 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2095 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2096 2097 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2098 2099 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2100 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2101 Kokkos::realloc(factors->iU_d, n + 1); 2102 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2103 2104 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2105 auto i_d = aijkok->i_dual.view_device(); 2106 auto j_d = aijkok->j_dual.view_device(); 2107 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2108 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2109 2110 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2111 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2112 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2113 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2114 2115 /* TODO: add options to select sptrsv algorithms */ 2116 /* Create sptrsv handles for L, U and their transpose */ 2117 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2118 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2119 #else 2120 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2121 #endif 2122 2123 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2124 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2125 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2126 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2127 2128 /* Fill fields of the factor matrix B */ 2129 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2130 b = (Mat_SeqAIJ *)B->data; 2131 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2132 B->info.fill_ratio_given = info->fill; 2133 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2134 2135 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2136 PetscFunctionReturn(PETSC_SUCCESS); 2137 } 2138 2139 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2140 { 2141 PetscFunctionBegin; 2142 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2143 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2144 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2145 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2146 PetscFunctionReturn(PETSC_SUCCESS); 2147 } 2148 2149 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2150 { 2151 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2152 2153 PetscFunctionBegin; 2154 if (!info->factoronhost) { 2155 PetscCall(ISIdentity(isrow, &row_identity)); 2156 PetscCall(ISIdentity(iscol, &col_identity)); 2157 } 2158 2159 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2160 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2161 2162 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2163 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2164 } else { 2165 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2166 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2167 } 2168 PetscFunctionReturn(PETSC_SUCCESS); 2169 } 2170 2171 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2172 { 2173 PetscFunctionBegin; 2174 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2175 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2176 2177 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2178 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2179 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2180 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2181 const MatScalar *Ba = b->a; 2182 PetscInt m = B->rmap->n; 2183 2184 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2185 // Allocate memory and copy the structure 2186 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2187 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2188 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2189 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2190 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2191 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2192 2193 // Build jU_h from the skewed Aj 2194 PetscInt *Uj = factors->jU_h.data(); 2195 for (PetscInt i = 0; i < m; i++) { 2196 PetscInt ulen = Bi[i + 1] - Bi[i]; 2197 Uj[Bi[i]] = i; // diagonal entry 2198 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2199 } 2200 2201 // Copy iU, jU to device 2202 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2203 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2204 2205 // Copy row/col permutation to device 2206 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2207 PetscBool row_identity; 2208 PetscCall(ISIdentity(rowperm, &row_identity)); 2209 if (!row_identity) { 2210 const PetscInt *ip; 2211 2212 PetscCall(ISGetIndices(rowperm, &ip)); 2213 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2214 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2215 PetscCall(ISRestoreIndices(rowperm, &ip)); 2216 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2217 } 2218 2219 // Create sptrsv handles for U and U^T 2220 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2221 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2222 #else 2223 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2224 #endif 2225 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2226 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2227 } 2228 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2229 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2230 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2231 2232 // Copy the value 2233 PetscScalar *Ua = factors->aU_h.data(); 2234 PetscScalar *D = factors->D_h.data(); 2235 for (PetscInt i = 0; i < m; i++) { 2236 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2237 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2238 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2239 } 2240 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2241 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2242 2243 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2244 factors->transpose_updated = PETSC_FALSE; 2245 } 2246 2247 B->ops->matsolve = NULL; 2248 B->ops->matsolvetranspose = NULL; 2249 PetscFunctionReturn(PETSC_SUCCESS); 2250 } 2251 2252 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2253 { 2254 PetscFunctionBegin; 2255 if (info->solveonhost) { 2256 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2257 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2258 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2259 } 2260 2261 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2262 2263 if (!info->solveonhost) { 2264 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2265 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2266 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2267 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2268 } 2269 PetscFunctionReturn(PETSC_SUCCESS); 2270 } 2271 2272 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2273 { 2274 PetscFunctionBegin; 2275 if (info->solveonhost) { 2276 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2277 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2278 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2279 } 2280 2281 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2282 2283 if (!info->solveonhost) { 2284 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2285 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2286 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2287 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2288 } 2289 PetscFunctionReturn(PETSC_SUCCESS); 2290 } 2291 2292 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2293 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2294 { 2295 PetscFunctionBegin; 2296 *type = MATSOLVERKOKKOS; 2297 PetscFunctionReturn(PETSC_SUCCESS); 2298 } 2299 2300 /*MC 2301 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2302 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2303 2304 Level: beginner 2305 2306 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2307 M*/ 2308 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2309 { 2310 PetscInt n = A->rmap->n; 2311 MPI_Comm comm; 2312 2313 PetscFunctionBegin; 2314 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2315 PetscCall(MatCreate(comm, B)); 2316 PetscCall(MatSetSizes(*B, n, n, n, n)); 2317 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2318 (*B)->factortype = ftype; 2319 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2320 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2321 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2322 2323 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2324 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2325 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2326 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2327 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2328 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2329 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2330 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2331 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2332 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2333 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2334 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2335 2336 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2337 (*B)->canuseordering = PETSC_TRUE; 2338 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2339 PetscFunctionReturn(PETSC_SUCCESS); 2340 } 2341 2342 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2343 { 2344 PetscFunctionBegin; 2345 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2346 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2347 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscFunctionReturn(PETSC_SUCCESS); 2350 } 2351 2352 /* Utility to print out a KokkosCsrMatrix for debugging */ 2353 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2354 { 2355 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2356 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2357 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2358 const PetscInt *i = iv.data(); 2359 const PetscInt *j = jv.data(); 2360 const PetscScalar *a = av.data(); 2361 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2362 2363 PetscFunctionBegin; 2364 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2365 for (PetscInt k = 0; k < m; k++) { 2366 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2367 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2368 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2369 } 2370 PetscFunctionReturn(PETSC_SUCCESS); 2371 } 2372