1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165 *array = aijkok->a_dual.view_host().data(); 166 } else { /* Happens when calling MatSetValues on a newly created matrix */ 167 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168 } 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171 172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175 176 PetscFunctionBegin; 177 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188 *array = aijkok->a_dual.view_host().data(); 189 } else { 190 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 191 } 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196 { 197 PetscFunctionBegin; 198 *array = NULL; 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205 206 PetscFunctionBegin; 207 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208 *array = aijkok->a_dual.view_host().data(); 209 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 210 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 211 } 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218 219 PetscFunctionBegin; 220 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221 aijkok->a_dual.clear_sync_state(); 222 aijkok->a_dual.modify_host(); 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228 { 229 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 230 231 PetscFunctionBegin; 232 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 233 234 if (i) *i = aijkok->i_device_data(); 235 if (j) *j = aijkok->j_device_data(); 236 if (a) { 237 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 238 *a = aijkok->a_device_data(); 239 } 240 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 245 { 246 PetscFunctionBegin; 247 *mtype = PETSC_MEMTYPE_KOKKOS; 248 PetscFunctionReturn(PETSC_SUCCESS); 249 } 250 251 /* 252 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 253 254 Input Parameter: 255 . A - the MATSEQAIJKOKKOS matrix 256 257 Output Parameters: 258 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 259 - T_d - the transpose on device, whose value array is allocated but not initialized 260 */ 261 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 262 { 263 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 264 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 265 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 266 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 267 MatRowMapType *Ti = Ti_h.data(); 268 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 269 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 270 PetscInt *Tj = Tj_h.data(); 271 PetscInt *perm = perm_h.data(); 272 PetscInt *offset; 273 274 PetscFunctionBegin; 275 // Populate Ti 276 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 277 Ti++; 278 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 279 Ti--; 280 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 281 282 // Populate Tj and the permutation array 283 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 284 for (PetscInt i = 0; i < m; i++) { 285 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 286 PetscInt r = Aj[j]; // row r of T 287 PetscInt disp = Ti[r] + offset[r]; 288 289 Tj[disp] = i; // col i of T 290 perm[disp] = j; 291 offset[r]++; 292 } 293 } 294 PetscCall(PetscFree(offset)); 295 296 // Sort each row of T, along with the permutation array 297 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 298 299 // Output perm and T on device 300 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 301 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 302 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 303 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 304 PetscFunctionReturn(PETSC_SUCCESS); 305 } 306 307 // Generate the transpose on device and cache it internally 308 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 309 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 310 { 311 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 312 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 313 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 314 KokkosCsrMatrix &T = akok->csrmatT; 315 316 PetscFunctionBegin; 317 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 318 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 319 320 const auto &Aa = akok->a_dual.view_device(); 321 322 if (A->symmetric == PETSC_BOOL3_TRUE) { 323 *csrmatT = akok->csrmat; 324 } else { 325 // See if we already have a cached transpose and its value is up to date 326 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 327 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 328 const auto &perm = akok->transpose_perm; // get the permutation array 329 auto &Ta = T.values; 330 331 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 332 } 333 } else { // Generate T of size n x m for the first time 334 MatRowMapKokkosView perm; 335 336 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 337 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 338 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 339 } 340 akok->transpose_updated = PETSC_TRUE; 341 *csrmatT = akok->csrmatT; 342 } 343 PetscFunctionReturn(PETSC_SUCCESS); 344 } 345 346 // Generate the Hermitian on device and cache it internally 347 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 348 { 349 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 350 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 351 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 352 KokkosCsrMatrix &T = akok->csrmatH; 353 354 PetscFunctionBegin; 355 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 356 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 357 358 const auto &Aa = akok->a_dual.view_device(); 359 360 if (A->hermitian == PETSC_BOOL3_TRUE) { 361 *csrmatH = akok->csrmat; 362 } else { 363 // See if we already have a cached hermitian and its value is up to date 364 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 365 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 366 const auto &perm = akok->transpose_perm; // get the permutation array 367 auto &Ta = T.values; 368 369 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 370 } 371 } else { // Generate T of size n x m for the first time 372 MatRowMapKokkosView perm; 373 374 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 375 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 376 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 377 } 378 akok->hermitian_updated = PETSC_TRUE; 379 *csrmatH = akok->csrmatH; 380 } 381 PetscFunctionReturn(PETSC_SUCCESS); 382 } 383 384 /* y = A x */ 385 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 386 { 387 Mat_SeqAIJKokkos *aijkok; 388 ConstPetscScalarKokkosView xv; 389 PetscScalarKokkosView yv; 390 391 PetscFunctionBegin; 392 PetscCall(PetscLogGpuTimeBegin()); 393 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 394 PetscCall(VecGetKokkosView(xx, &xv)); 395 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 396 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 397 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 398 PetscCall(VecRestoreKokkosView(xx, &xv)); 399 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 400 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 401 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 402 PetscCall(PetscLogGpuTimeEnd()); 403 PetscFunctionReturn(PETSC_SUCCESS); 404 } 405 406 /* y = A^T x */ 407 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 408 { 409 Mat_SeqAIJKokkos *aijkok; 410 const char *mode; 411 ConstPetscScalarKokkosView xv; 412 PetscScalarKokkosView yv; 413 KokkosCsrMatrix csrmat; 414 415 PetscFunctionBegin; 416 PetscCall(PetscLogGpuTimeBegin()); 417 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 418 PetscCall(VecGetKokkosView(xx, &xv)); 419 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 420 if (A->form_explicit_transpose) { 421 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 422 mode = "N"; 423 } else { 424 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 425 csrmat = aijkok->csrmat; 426 mode = "T"; 427 } 428 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 429 PetscCall(VecRestoreKokkosView(xx, &xv)); 430 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 431 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 432 PetscCall(PetscLogGpuTimeEnd()); 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 /* y = A^H x */ 437 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 438 { 439 Mat_SeqAIJKokkos *aijkok; 440 const char *mode; 441 ConstPetscScalarKokkosView xv; 442 PetscScalarKokkosView yv; 443 KokkosCsrMatrix csrmat; 444 445 PetscFunctionBegin; 446 PetscCall(PetscLogGpuTimeBegin()); 447 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 448 PetscCall(VecGetKokkosView(xx, &xv)); 449 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 450 if (A->form_explicit_transpose) { 451 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 452 mode = "N"; 453 } else { 454 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 455 csrmat = aijkok->csrmat; 456 mode = "C"; 457 } 458 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 459 PetscCall(VecRestoreKokkosView(xx, &xv)); 460 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 461 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 462 PetscCall(PetscLogGpuTimeEnd()); 463 PetscFunctionReturn(PETSC_SUCCESS); 464 } 465 466 /* z = A x + y */ 467 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 468 { 469 Mat_SeqAIJKokkos *aijkok; 470 ConstPetscScalarKokkosView xv; 471 PetscScalarKokkosView zv; 472 473 PetscFunctionBegin; 474 PetscCall(PetscLogGpuTimeBegin()); 475 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 476 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 477 PetscCall(VecGetKokkosView(xx, &xv)); 478 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 479 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 480 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 481 PetscCall(VecRestoreKokkosView(xx, &xv)); 482 PetscCall(VecRestoreKokkosView(zz, &zv)); 483 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 484 PetscCall(PetscLogGpuTimeEnd()); 485 PetscFunctionReturn(PETSC_SUCCESS); 486 } 487 488 /* z = A^T x + y */ 489 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 490 { 491 Mat_SeqAIJKokkos *aijkok; 492 const char *mode; 493 ConstPetscScalarKokkosView xv; 494 PetscScalarKokkosView zv; 495 KokkosCsrMatrix csrmat; 496 497 PetscFunctionBegin; 498 PetscCall(PetscLogGpuTimeBegin()); 499 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 500 if (zz != yy) PetscCall(VecCopy(yy, zz)); 501 PetscCall(VecGetKokkosView(xx, &xv)); 502 PetscCall(VecGetKokkosView(zz, &zv)); 503 if (A->form_explicit_transpose) { 504 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 505 mode = "N"; 506 } else { 507 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508 csrmat = aijkok->csrmat; 509 mode = "T"; 510 } 511 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 512 PetscCall(VecRestoreKokkosView(xx, &xv)); 513 PetscCall(VecRestoreKokkosView(zz, &zv)); 514 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 515 PetscCall(PetscLogGpuTimeEnd()); 516 PetscFunctionReturn(PETSC_SUCCESS); 517 } 518 519 /* z = A^H x + y */ 520 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 521 { 522 Mat_SeqAIJKokkos *aijkok; 523 const char *mode; 524 ConstPetscScalarKokkosView xv; 525 PetscScalarKokkosView zv; 526 KokkosCsrMatrix csrmat; 527 528 PetscFunctionBegin; 529 PetscCall(PetscLogGpuTimeBegin()); 530 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 531 if (zz != yy) PetscCall(VecCopy(yy, zz)); 532 PetscCall(VecGetKokkosView(xx, &xv)); 533 PetscCall(VecGetKokkosView(zz, &zv)); 534 if (A->form_explicit_transpose) { 535 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 536 mode = "N"; 537 } else { 538 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 csrmat = aijkok->csrmat; 540 mode = "C"; 541 } 542 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 543 PetscCall(VecRestoreKokkosView(xx, &xv)); 544 PetscCall(VecRestoreKokkosView(zz, &zv)); 545 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 546 PetscCall(PetscLogGpuTimeEnd()); 547 PetscFunctionReturn(PETSC_SUCCESS); 548 } 549 550 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 551 { 552 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 553 554 PetscFunctionBegin; 555 switch (op) { 556 case MAT_FORM_EXPLICIT_TRANSPOSE: 557 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 558 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 559 A->form_explicit_transpose = flg; 560 break; 561 default: 562 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 563 break; 564 } 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 568 /* Depending on reuse, either build a new mat, or use the existing mat */ 569 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 570 { 571 Mat_SeqAIJ *aseq; 572 573 PetscFunctionBegin; 574 PetscCall(PetscKokkosInitializeCheck()); 575 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 576 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 577 PetscCall(MatSetType(*newmat, mtype)); 578 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 579 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 580 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 581 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 582 PetscCall(PetscFree(A->defaultvectype)); 583 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 584 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 585 PetscCall(MatSetOps_SeqAIJKokkos(A)); 586 aseq = static_cast<Mat_SeqAIJ *>(A->data); 587 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 588 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 589 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 590 } 591 } 592 PetscFunctionReturn(PETSC_SUCCESS); 593 } 594 595 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 596 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 597 */ 598 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 599 { 600 Mat_SeqAIJ *bseq; 601 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 602 Mat mat; 603 604 PetscFunctionBegin; 605 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 606 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 607 mat = *B; 608 if (A->assembled) { 609 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 610 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 611 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 612 /* Now copy values to B if needed */ 613 if (dupOption == MAT_COPY_VALUES) { 614 if (akok->a_dual.need_sync_device()) { 615 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 616 bkok->a_dual.modify_host(); 617 } else { /* If device has the latest data, we only copy data on device */ 618 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 619 bkok->a_dual.modify_device(); 620 } 621 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 622 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 623 bkok->a_dual.modify_host(); 624 } 625 mat->spptr = bkok; 626 } 627 628 PetscCall(PetscFree(mat->defaultvectype)); 629 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 630 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 631 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 632 PetscFunctionReturn(PETSC_SUCCESS); 633 } 634 635 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 636 { 637 Mat At; 638 KokkosCsrMatrix internT; 639 Mat_SeqAIJKokkos *atkok, *bkok; 640 641 PetscFunctionBegin; 642 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 643 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 644 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 645 /* Deep copy internT, as we want to isolate the internal transpose */ 646 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 647 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 648 if (reuse == MAT_INITIAL_MATRIX) *B = At; 649 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 650 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 651 if ((*B)->assembled) { 652 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 653 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 654 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 655 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 656 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 657 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 658 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 659 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 660 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 661 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 662 } 663 PetscFunctionReturn(PETSC_SUCCESS); 664 } 665 666 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 667 { 668 Mat_SeqAIJKokkos *aijkok; 669 670 PetscFunctionBegin; 671 if (A->factortype == MAT_FACTOR_NONE) { 672 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 673 delete aijkok; 674 } else { 675 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 676 } 677 A->spptr = NULL; 678 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 681 #if defined(PETSC_HAVE_HYPRE) 682 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 683 #endif 684 PetscCall(MatDestroy_SeqAIJ(A)); 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 /*MC 689 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 690 691 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 692 693 Options Database Key: 694 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 695 696 Level: beginner 697 698 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 699 M*/ 700 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 701 { 702 PetscFunctionBegin; 703 PetscCall(PetscKokkosInitializeCheck()); 704 PetscCall(MatCreate_SeqAIJ(A)); 705 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 706 PetscFunctionReturn(PETSC_SUCCESS); 707 } 708 709 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 710 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 711 { 712 Mat_SeqAIJ *a, *b; 713 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 714 MatScalarKokkosView aa, ba, ca; 715 MatRowMapKokkosView ai, bi, ci; 716 MatColIdxKokkosView aj, bj, cj; 717 PetscInt m, n, nnz, aN; 718 719 PetscFunctionBegin; 720 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 721 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 722 PetscAssertPointer(C, 4); 723 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 724 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 725 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 726 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 727 728 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 729 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 730 a = static_cast<Mat_SeqAIJ *>(A->data); 731 b = static_cast<Mat_SeqAIJ *>(B->data); 732 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 733 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 734 aa = akok->a_dual.view_device(); 735 ai = akok->i_dual.view_device(); 736 ba = bkok->a_dual.view_device(); 737 bi = bkok->i_dual.view_device(); 738 m = A->rmap->n; /* M, N and nnz of C */ 739 n = A->cmap->n + B->cmap->n; 740 nnz = a->nz + b->nz; 741 aN = A->cmap->n; /* N of A */ 742 if (reuse == MAT_INITIAL_MATRIX) { 743 aj = akok->j_dual.view_device(); 744 bj = bkok->j_dual.view_device(); 745 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 746 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 747 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 748 749 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 750 Kokkos::parallel_for( 751 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752 PetscInt i = t.league_rank(); /* row i */ 753 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754 755 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 756 ci(i) = coffset; 757 if (i == m - 1) ci(m) = ai(m) + bi(m); 758 }); 759 760 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 761 if (k < alen) { 762 ca(coffset + k) = aa(ai(i) + k); 763 cj(coffset + k) = aj(ai(i) + k); 764 } else { 765 ca(coffset + k) = ba(bi(i) + k - alen); 766 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 767 } 768 }); 769 }); 770 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 771 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 772 } else if (reuse == MAT_REUSE_MATRIX) { 773 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 774 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 775 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 776 ca = ckok->a_dual.view_device(); 777 ci = ckok->i_dual.view_device(); 778 779 Kokkos::parallel_for( 780 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 781 PetscInt i = t.league_rank(); /* row i */ 782 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 783 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 784 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 785 else ca(ci(i) + k) = ba(bi(i) + k - alen); 786 }); 787 }); 788 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 789 } 790 PetscFunctionReturn(PETSC_SUCCESS); 791 } 792 793 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 794 { 795 PetscFunctionBegin; 796 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 797 PetscFunctionReturn(PETSC_SUCCESS); 798 } 799 800 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 801 { 802 Mat_Product *product = C->product; 803 Mat A, B; 804 bool transA, transB; /* use bool, since KK needs this type */ 805 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 806 Mat_SeqAIJ *c; 807 MatProductData_SeqAIJKokkos *pdata; 808 KokkosCsrMatrix csrmatA, csrmatB; 809 810 PetscFunctionBegin; 811 MatCheckProduct(C, 1); 812 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 813 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 814 815 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 816 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 817 // we still do numeric. 818 if (pdata->reusesym) { // numeric reuses results from symbolic 819 pdata->reusesym = PETSC_FALSE; 820 PetscFunctionReturn(PETSC_SUCCESS); 821 } 822 823 switch (product->type) { 824 case MATPRODUCT_AB: 825 transA = false; 826 transB = false; 827 break; 828 case MATPRODUCT_AtB: 829 transA = true; 830 transB = false; 831 break; 832 case MATPRODUCT_ABt: 833 transA = false; 834 transB = true; 835 break; 836 default: 837 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 838 } 839 840 A = product->A; 841 B = product->B; 842 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 843 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 844 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 845 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 846 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 847 848 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 849 850 csrmatA = akok->csrmat; 851 csrmatB = bkok->csrmat; 852 853 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 854 if (transA) { 855 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 856 transA = false; 857 } 858 859 if (transB) { 860 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 861 transB = false; 862 } 863 PetscCall(PetscLogGpuTimeBegin()); 864 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 865 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 866 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 867 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 868 #endif 869 870 PetscCall(PetscLogGpuTimeEnd()); 871 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 872 /* shorter version of MatAssemblyEnd_SeqAIJ */ 873 c = (Mat_SeqAIJ *)C->data; 874 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 875 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 876 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 877 c->reallocs = 0; 878 C->info.mallocs = 0; 879 C->info.nz_unneeded = 0; 880 C->assembled = C->was_assembled = PETSC_TRUE; 881 C->num_ass++; 882 PetscFunctionReturn(PETSC_SUCCESS); 883 } 884 885 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 886 { 887 Mat_Product *product = C->product; 888 MatProductType ptype; 889 Mat A, B; 890 bool transA, transB; 891 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 892 MatProductData_SeqAIJKokkos *pdata; 893 MPI_Comm comm; 894 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 895 896 PetscFunctionBegin; 897 MatCheckProduct(C, 1); 898 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 899 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 900 A = product->A; 901 B = product->B; 902 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 903 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 904 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 905 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 906 csrmatA = akok->csrmat; 907 csrmatB = bkok->csrmat; 908 909 ptype = product->type; 910 // Take advantage of the symmetry if true 911 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 912 ptype = MATPRODUCT_AB; 913 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 914 } 915 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 916 ptype = MATPRODUCT_AB; 917 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 918 } 919 920 switch (ptype) { 921 case MATPRODUCT_AB: 922 transA = false; 923 transB = false; 924 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 925 break; 926 case MATPRODUCT_AtB: 927 transA = true; 928 transB = false; 929 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 930 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 931 break; 932 case MATPRODUCT_ABt: 933 transA = false; 934 transB = true; 935 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 936 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 937 break; 938 default: 939 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 940 } 941 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 942 pdata->reusesym = product->api_user; 943 944 /* TODO: add command line options to select spgemm algorithms */ 945 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 946 947 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 948 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 949 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 950 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 951 #endif 952 #endif 953 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 954 955 PetscCall(PetscLogGpuTimeBegin()); 956 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 957 if (transA) { 958 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 959 transA = false; 960 } 961 962 if (transB) { 963 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 964 transB = false; 965 } 966 967 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 968 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 969 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 970 calling new Mat_SeqAIJKokkos(). 971 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 972 */ 973 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 974 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 975 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 976 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 977 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 978 #endif 979 PetscCall(PetscLogGpuTimeEnd()); 980 981 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 982 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 983 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 /* handles sparse matrix matrix ops */ 988 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 989 { 990 Mat_Product *product = mat->product; 991 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 992 993 PetscFunctionBegin; 994 MatCheckProduct(mat, 1); 995 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 996 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 997 if (Biskok && Ciskok) { 998 switch (product->type) { 999 case MATPRODUCT_AB: 1000 case MATPRODUCT_AtB: 1001 case MATPRODUCT_ABt: 1002 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1003 break; 1004 case MATPRODUCT_PtAP: 1005 case MATPRODUCT_RARt: 1006 case MATPRODUCT_ABC: 1007 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1008 break; 1009 default: 1010 break; 1011 } 1012 } else { /* fallback for AIJ */ 1013 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1014 } 1015 PetscFunctionReturn(PETSC_SUCCESS); 1016 } 1017 1018 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1019 { 1020 Mat_SeqAIJKokkos *aijkok; 1021 1022 PetscFunctionBegin; 1023 PetscCall(PetscLogGpuTimeBegin()); 1024 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1025 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1026 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1027 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1028 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1029 PetscCall(PetscLogGpuTimeEnd()); 1030 PetscFunctionReturn(PETSC_SUCCESS); 1031 } 1032 1033 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1034 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1035 { 1036 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1037 1038 PetscFunctionBegin; 1039 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1040 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1041 1042 PetscCall(PetscLogGpuTimeBegin()); 1043 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045 const auto &Aa = aijkok->a_dual.view_device(); 1046 const auto &Adiag = aijkok->diag_dual.view_device(); 1047 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1048 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1049 PetscCall(PetscLogGpuFlops(n)); 1050 PetscCall(PetscLogGpuTimeEnd()); 1051 } else { // need reassembly, very slow! 1052 PetscCall(MatShift_Basic(A, a)); 1053 } 1054 PetscFunctionReturn(PETSC_SUCCESS); 1055 } 1056 1057 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1058 { 1059 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1060 1061 PetscFunctionBegin; 1062 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1063 ConstPetscScalarKokkosView dv; 1064 PetscInt n, nv; 1065 1066 PetscCall(PetscLogGpuTimeBegin()); 1067 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1068 PetscCall(VecGetKokkosView(D, &dv)); 1069 PetscCall(VecGetLocalSize(D, &nv)); 1070 n = PetscMin(Y->rmap->n, Y->cmap->n); 1071 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1072 1073 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1074 const auto &Aa = aijkok->a_dual.view_device(); 1075 const auto &Adiag = aijkok->diag_dual.view_device(); 1076 PetscCallCXX(Kokkos::parallel_for( 1077 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1078 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1079 else Aa(Adiag(i)) += dv(i); 1080 })); 1081 PetscCall(VecRestoreKokkosView(D, &dv)); 1082 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1083 PetscCall(PetscLogGpuFlops(n)); 1084 PetscCall(PetscLogGpuTimeEnd()); 1085 } else { // need reassembly, very slow! 1086 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1087 } 1088 PetscFunctionReturn(PETSC_SUCCESS); 1089 } 1090 1091 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1092 { 1093 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1094 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1095 ConstPetscScalarKokkosView lv, rv; 1096 1097 PetscFunctionBegin; 1098 PetscCall(PetscLogGpuTimeBegin()); 1099 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1100 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1101 const auto &Aa = aijkok->a_dual.view_device(); 1102 const auto &Ai = aijkok->i_dual.view_device(); 1103 const auto &Aj = aijkok->j_dual.view_device(); 1104 if (ll) { 1105 PetscCall(VecGetLocalSize(ll, &m)); 1106 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1107 PetscCall(VecGetKokkosView(ll, &lv)); 1108 PetscCallCXX(Kokkos::parallel_for( // for each row 1109 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1110 PetscInt i = t.league_rank(); // row i 1111 PetscInt len = Ai(i + 1) - Ai(i); 1112 // scale entries on the row 1113 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1114 })); 1115 PetscCall(VecRestoreKokkosView(ll, &lv)); 1116 PetscCall(PetscLogGpuFlops(nz)); 1117 } 1118 if (rr) { 1119 PetscCall(VecGetLocalSize(rr, &n)); 1120 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1121 PetscCall(VecGetKokkosView(rr, &rv)); 1122 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1123 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1124 PetscCall(VecRestoreKokkosView(rr, &lv)); 1125 PetscCall(PetscLogGpuFlops(nz)); 1126 } 1127 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1128 PetscCall(PetscLogGpuTimeEnd()); 1129 PetscFunctionReturn(PETSC_SUCCESS); 1130 } 1131 1132 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1133 { 1134 Mat_SeqAIJKokkos *aijkok; 1135 1136 PetscFunctionBegin; 1137 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1138 if (aijkok) { /* Only zero the device if data is already there */ 1139 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1140 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1141 } else { /* Might be preallocated but not assembled */ 1142 PetscCall(MatZeroEntries_SeqAIJ(A)); 1143 } 1144 PetscFunctionReturn(PETSC_SUCCESS); 1145 } 1146 1147 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1148 { 1149 Mat_SeqAIJKokkos *aijkok; 1150 PetscInt n; 1151 PetscScalarKokkosView xv; 1152 1153 PetscFunctionBegin; 1154 PetscCall(VecGetLocalSize(x, &n)); 1155 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1156 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1157 1158 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1159 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1160 1161 const auto &Aa = aijkok->a_dual.view_device(); 1162 const auto &Ai = aijkok->i_dual.view_device(); 1163 const auto &Adiag = aijkok->diag_dual.view_device(); 1164 1165 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1166 Kokkos::parallel_for( 1167 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1168 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1169 else xv(i) = 0; 1170 }); 1171 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1172 PetscFunctionReturn(PETSC_SUCCESS); 1173 } 1174 1175 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1176 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1177 { 1178 Mat_SeqAIJKokkos *aijkok; 1179 1180 PetscFunctionBegin; 1181 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1182 PetscAssertPointer(kv, 2); 1183 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1184 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1185 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1186 *kv = aijkok->a_dual.view_device(); 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1191 { 1192 PetscFunctionBegin; 1193 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1194 PetscAssertPointer(kv, 2); 1195 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1196 PetscFunctionReturn(PETSC_SUCCESS); 1197 } 1198 1199 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1200 { 1201 Mat_SeqAIJKokkos *aijkok; 1202 1203 PetscFunctionBegin; 1204 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1205 PetscAssertPointer(kv, 2); 1206 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1207 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1208 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1209 *kv = aijkok->a_dual.view_device(); 1210 PetscFunctionReturn(PETSC_SUCCESS); 1211 } 1212 1213 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1214 { 1215 PetscFunctionBegin; 1216 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1217 PetscAssertPointer(kv, 2); 1218 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1219 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1220 PetscFunctionReturn(PETSC_SUCCESS); 1221 } 1222 1223 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1224 { 1225 Mat_SeqAIJKokkos *aijkok; 1226 1227 PetscFunctionBegin; 1228 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1229 PetscAssertPointer(kv, 2); 1230 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1231 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1232 *kv = aijkok->a_dual.view_device(); 1233 PetscFunctionReturn(PETSC_SUCCESS); 1234 } 1235 1236 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1237 { 1238 PetscFunctionBegin; 1239 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1240 PetscAssertPointer(kv, 2); 1241 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1242 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1243 PetscFunctionReturn(PETSC_SUCCESS); 1244 } 1245 1246 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1247 { 1248 Mat_SeqAIJKokkos *akok; 1249 1250 PetscFunctionBegin; 1251 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1252 PetscCall(MatCreate(comm, A)); 1253 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1254 PetscFunctionReturn(PETSC_SUCCESS); 1255 } 1256 1257 /* Computes Y += alpha X */ 1258 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1259 { 1260 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1261 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1262 ConstMatScalarKokkosView Xa; 1263 MatScalarKokkosView Ya; 1264 auto exec = PetscGetKokkosExecutionSpace(); 1265 1266 PetscFunctionBegin; 1267 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1268 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1269 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1270 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1271 PetscCall(PetscLogGpuTimeBegin()); 1272 1273 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1274 PetscBool e; 1275 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1276 if (e) { 1277 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1278 if (e) pattern = SAME_NONZERO_PATTERN; 1279 } 1280 } 1281 1282 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1283 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1284 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1285 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1286 */ 1287 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1288 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1289 Xa = xkok->a_dual.view_device(); 1290 Ya = ykok->a_dual.view_device(); 1291 1292 if (pattern == SAME_NONZERO_PATTERN) { 1293 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1294 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1295 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1296 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1297 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1298 1299 Kokkos::parallel_for( 1300 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1301 PetscInt i = t.league_rank(); // row i 1302 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1303 // Only one thread works in a team 1304 PetscInt p, q = Yi(i); 1305 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1306 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1307 if (Xj(p) == Yj(q)) { // Found it 1308 Ya(q) += alpha * Xa(p); 1309 q++; 1310 } else { 1311 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1312 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1313 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1314 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1315 #else 1316 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1317 #endif 1318 } 1319 } 1320 }); 1321 }); 1322 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1323 } else { // different nonzero patterns 1324 Mat Z; 1325 KokkosCsrMatrix zcsr; 1326 KernelHandle kh; 1327 kh.create_spadd_handle(true); // X, Y are sorted 1328 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1329 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1330 zkok = new Mat_SeqAIJKokkos(zcsr); 1331 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1332 PetscCall(MatHeaderReplace(Y, &Z)); 1333 kh.destroy_spadd_handle(); 1334 } 1335 PetscCall(PetscLogGpuTimeEnd()); 1336 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1337 PetscFunctionReturn(PETSC_SUCCESS); 1338 } 1339 1340 struct MatCOOStruct_SeqAIJKokkos { 1341 PetscCount n; 1342 PetscCount Atot; 1343 PetscInt nz; 1344 PetscCountKokkosView jmap; 1345 PetscCountKokkosView perm; 1346 1347 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1348 { 1349 nz = coo_h->nz; 1350 n = coo_h->n; 1351 Atot = coo_h->Atot; 1352 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1353 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1354 } 1355 }; 1356 1357 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1358 { 1359 PetscFunctionBegin; 1360 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1361 PetscFunctionReturn(PETSC_SUCCESS); 1362 } 1363 1364 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1365 { 1366 Mat_SeqAIJKokkos *akok; 1367 Mat_SeqAIJ *aseq; 1368 PetscContainer container_h; 1369 MatCOOStruct_SeqAIJ *coo_h; 1370 MatCOOStruct_SeqAIJKokkos *coo_d; 1371 1372 PetscFunctionBegin; 1373 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1374 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1375 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1376 delete akok; 1377 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1378 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1379 1380 // Copy the COO struct to device 1381 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1382 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1383 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1384 1385 // Put the COO struct in a container and then attach that to the matrix 1386 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1387 PetscFunctionReturn(PETSC_SUCCESS); 1388 } 1389 1390 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1391 { 1392 MatScalarKokkosView Aa; 1393 ConstMatScalarKokkosView kv; 1394 PetscMemType memtype; 1395 PetscContainer container; 1396 MatCOOStruct_SeqAIJKokkos *coo; 1397 1398 PetscFunctionBegin; 1399 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1400 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1401 1402 const auto &n = coo->n; 1403 const auto &Annz = coo->nz; 1404 const auto &jmap = coo->jmap; 1405 const auto &perm = coo->perm; 1406 1407 PetscCall(PetscGetMemType(v, &memtype)); 1408 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1409 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1410 } else { 1411 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1412 } 1413 1414 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1415 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1416 1417 PetscCall(PetscLogGpuTimeBegin()); 1418 Kokkos::parallel_for( 1419 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1420 PetscScalar sum = 0.0; 1421 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1422 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1423 }); 1424 PetscCall(PetscLogGpuTimeEnd()); 1425 1426 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1427 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1428 PetscFunctionReturn(PETSC_SUCCESS); 1429 } 1430 1431 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1432 { 1433 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1434 1435 PetscFunctionBegin; 1436 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1437 A->boundtocpu = PETSC_FALSE; 1438 1439 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1440 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1441 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1442 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1443 A->ops->scale = MatScale_SeqAIJKokkos; 1444 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1445 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1446 A->ops->mult = MatMult_SeqAIJKokkos; 1447 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1448 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1449 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1450 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1451 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1452 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1453 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1454 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1455 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1456 A->ops->shift = MatShift_SeqAIJKokkos; 1457 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1458 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1459 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1460 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1461 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1462 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1463 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1464 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1465 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1466 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1467 1468 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1470 #if defined(PETSC_HAVE_HYPRE) 1471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1472 #endif 1473 PetscFunctionReturn(PETSC_SUCCESS); 1474 } 1475 1476 /* 1477 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1478 1479 Input Parameters: 1480 + A - the MATSEQAIJKOKKOS matrix 1481 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1482 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1483 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1484 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1485 1486 Output Parameter: 1487 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1488 */ 1489 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1490 { 1491 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1492 PetscInt nblocks = bs.extent(0) - 1; 1493 1494 PetscFunctionBegin; 1495 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1496 1497 // Pull out the diagonal blocks of the matrix and then invert the blocks 1498 auto Aa = akok->a_dual.view_device(); 1499 auto Ai = akok->i_dual.view_device(); 1500 auto Aj = akok->j_dual.view_device(); 1501 auto Adiag = akok->diag_dual.view_device(); 1502 // TODO: how to tune the team size? 1503 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1504 auto ts = Kokkos::AUTO(); 1505 #else 1506 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1507 #endif 1508 PetscCallCXX(Kokkos::parallel_for( 1509 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1510 const PetscInt bid = teamMember.league_rank(); // block id 1511 const PetscInt rstart = bs(bid); // this block starts from this row 1512 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1513 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1514 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1515 1516 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1517 PetscInt i = rstart + r; // i-th row in A 1518 1519 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1520 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1521 1522 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1523 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1524 B(r, c) = 0.0; 1525 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1526 B(r, c) = Aa(first + c); 1527 } else { // this entry does not show up in the CSR 1528 B(r, c) = 0.0; 1529 } 1530 } 1531 } else { // rare case that the diagonal does not exist 1532 const PetscInt begin = Ai(i); 1533 const PetscInt end = Ai(i + 1); 1534 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1535 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1536 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1537 else if (Aj(j) >= rstart + m) break; 1538 } 1539 } 1540 }); 1541 1542 // LU-decompose B (w/o pivoting) and then invert B 1543 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1544 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1545 })); 1546 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1547 PetscFunctionReturn(PETSC_SUCCESS); 1548 } 1549 1550 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1551 { 1552 Mat_SeqAIJ *aseq; 1553 PetscInt i, m, n; 1554 auto exec = PetscGetKokkosExecutionSpace(); 1555 1556 PetscFunctionBegin; 1557 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1558 1559 m = akok->nrows(); 1560 n = akok->ncols(); 1561 PetscCall(MatSetSizes(A, m, n, m, n)); 1562 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1563 1564 /* Set up data structures of A as a MATSEQAIJ */ 1565 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1566 aseq = (Mat_SeqAIJ *)A->data; 1567 1568 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1569 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1570 1571 aseq->i = akok->i_host_data(); 1572 aseq->j = akok->j_host_data(); 1573 aseq->a = akok->a_host_data(); 1574 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1575 aseq->free_a = PETSC_FALSE; 1576 aseq->free_ij = PETSC_FALSE; 1577 aseq->nz = akok->nnz(); 1578 aseq->maxnz = aseq->nz; 1579 1580 PetscCall(PetscMalloc1(m, &aseq->imax)); 1581 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1582 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1583 1584 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1585 akok->nonzerostate = A->nonzerostate; 1586 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1587 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1588 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1589 PetscFunctionReturn(PETSC_SUCCESS); 1590 } 1591 1592 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1593 { 1594 PetscFunctionBegin; 1595 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1596 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1597 PetscFunctionReturn(PETSC_SUCCESS); 1598 } 1599 1600 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1601 { 1602 Mat_SeqAIJKokkos *akok; 1603 1604 PetscFunctionBegin; 1605 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1606 PetscCall(MatCreate(comm, A)); 1607 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1608 PetscFunctionReturn(PETSC_SUCCESS); 1609 } 1610 1611 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1612 1613 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1614 */ 1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1616 { 1617 PetscFunctionBegin; 1618 PetscCall(MatCreate(comm, A)); 1619 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1620 PetscFunctionReturn(PETSC_SUCCESS); 1621 } 1622 1623 /*@C 1624 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1625 (the default parallel PETSc format). This matrix will ultimately be handled by 1626 Kokkos for calculations. 1627 1628 Collective 1629 1630 Input Parameters: 1631 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1632 . m - number of rows 1633 . n - number of columns 1634 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1635 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1636 1637 Output Parameter: 1638 . A - the matrix 1639 1640 Level: intermediate 1641 1642 Notes: 1643 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1644 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1645 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1646 1647 The AIJ format, also called 1648 compressed row storage, is fully compatible with standard Fortran 1649 storage. That is, the stored row and column indices can begin at 1650 either one (as in Fortran) or zero. 1651 1652 Specify the preallocated storage with either `nz` or `nnz` (not both). 1653 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1654 allocation. 1655 1656 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1657 @*/ 1658 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1659 { 1660 PetscFunctionBegin; 1661 PetscCall(PetscKokkosInitializeCheck()); 1662 PetscCall(MatCreate(comm, A)); 1663 PetscCall(MatSetSizes(*A, m, n, m, n)); 1664 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1665 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1666 PetscFunctionReturn(PETSC_SUCCESS); 1667 } 1668 1669 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1670 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1671 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1672 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1673 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1674 { 1675 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1676 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1677 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1678 1679 PetscFunctionBegin; 1680 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1681 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1682 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1683 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1684 } 1685 PetscFunctionReturn(PETSC_SUCCESS); 1686 } 1687 1688 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1689 { 1690 const PetscInt n = A->rmap->n; 1691 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1692 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1693 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1694 1695 PetscFunctionBegin; 1696 if (!factors->transpose_updated) { 1697 if (has_upper) { 1698 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1699 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1700 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1701 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1702 } 1703 1704 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1705 if (!factors->U) { 1706 Mat_SeqAIJ *seq; 1707 1708 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1709 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1710 1711 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1712 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1713 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1714 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1715 } else { 1716 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1717 } 1718 // Copy Ut from host to device 1719 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1720 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1721 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1722 } else { // If U was computed on device, we also compute the transpose there 1723 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1724 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1725 factors->jU_d, factors->aU_d, 1726 factors->iUt_d, factors->jUt_d, 1727 factors->aUt_d)); 1728 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1729 } 1730 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1731 } 1732 1733 // do the same for L with LU 1734 if (has_lower) { 1735 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1736 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1737 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1738 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1739 } 1740 1741 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1742 if (!factors->L) { 1743 Mat_SeqAIJ *seq; 1744 1745 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1746 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1747 1748 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1749 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1750 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1751 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1752 } else { 1753 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1754 } 1755 // Copy Lt from host to device 1756 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1757 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1758 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1759 } else { // If L was computed on device, we also compute the transpose there 1760 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1761 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1762 factors->jL_d, factors->aL_d, 1763 factors->iLt_d, factors->jLt_d, 1764 factors->aLt_d)); 1765 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1766 } 1767 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1768 } 1769 1770 factors->transpose_updated = PETSC_TRUE; 1771 } 1772 PetscFunctionReturn(PETSC_SUCCESS); 1773 } 1774 1775 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1776 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1777 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1778 { 1779 auto exec = PetscGetKokkosExecutionSpace(); 1780 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1781 PetscInt m = A->rmap->n; 1782 PetscScalarKokkosView D = factors->D_d; 1783 PetscScalarKokkosView X, Y, B; // alias 1784 ConstPetscScalarKokkosView b; 1785 PetscScalarKokkosView x; 1786 PetscIntKokkosView &rowperm = factors->rowperm; 1787 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1788 1789 PetscFunctionBegin; 1790 PetscCall(PetscLogGpuTimeBegin()); 1791 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1792 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1793 PetscCall(VecGetKokkosView(bb, &b)); 1794 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1795 1796 // Solve U^T Y = B 1797 if (identity) { // Reorder b with the row permutation 1798 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1799 Y = factors->workVector; 1800 } else { 1801 B = factors->workVector; 1802 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1803 Y = x; 1804 } 1805 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1806 1807 // Solve diag(D) Y' = Y. 1808 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1809 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1810 1811 // Solve UX = Y 1812 if (identity) { 1813 X = x; 1814 } else { 1815 X = factors->workVector; // B is not needed anymore 1816 } 1817 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1818 1819 // Reorder X with the inverse column (row) permutation 1820 if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1821 1822 PetscCall(VecRestoreKokkosView(bb, &b)); 1823 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1824 PetscCall(PetscLogGpuTimeEnd()); 1825 PetscFunctionReturn(PETSC_SUCCESS); 1826 } 1827 1828 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1829 // R and C are represented by rowperm and colperm in factors. 1830 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1831 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1832 { 1833 auto exec = PetscGetKokkosExecutionSpace(); 1834 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1835 PetscInt m = A->rmap->n; 1836 PetscScalarKokkosView X, Y, B; // alias 1837 ConstPetscScalarKokkosView b; 1838 PetscScalarKokkosView x; 1839 PetscIntKokkosView &rowperm = factors->rowperm; 1840 PetscIntKokkosView &colperm = factors->colperm; 1841 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1842 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1843 1844 PetscFunctionBegin; 1845 PetscCall(PetscLogGpuTimeBegin()); 1846 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1847 PetscCall(VecGetKokkosView(bb, &b)); 1848 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1849 1850 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1851 if (row_identity) { 1852 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1853 Y = factors->workVector; 1854 } else { 1855 B = factors->workVector; 1856 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1857 Y = x; 1858 } 1859 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1860 1861 // Solve U C^- x = Y 1862 if (col_identity) { 1863 X = x; 1864 } else { 1865 X = factors->workVector; 1866 } 1867 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1868 1869 // x = C X; Reorder X with the inverse col permutation 1870 if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1871 1872 PetscCall(VecRestoreKokkosView(bb, &b)); 1873 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1874 PetscCall(PetscLogGpuTimeEnd()); 1875 PetscFunctionReturn(PETSC_SUCCESS); 1876 } 1877 1878 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1879 // R and C are represented by rowperm and colperm in factors. 1880 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1881 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1882 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1883 { 1884 auto exec = PetscGetKokkosExecutionSpace(); 1885 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1886 PetscInt m = A->rmap->n; 1887 PetscScalarKokkosView X, Y, B; // alias 1888 ConstPetscScalarKokkosView b; 1889 PetscScalarKokkosView x; 1890 PetscIntKokkosView &rowperm = factors->rowperm; 1891 PetscIntKokkosView &colperm = factors->colperm; 1892 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1893 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1894 1895 PetscFunctionBegin; 1896 PetscCall(PetscLogGpuTimeBegin()); 1897 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1898 PetscCall(VecGetKokkosView(bb, &b)); 1899 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1900 1901 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1902 if (col_identity) { // Reorder b with the col permutation 1903 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1904 Y = factors->workVector; 1905 } else { 1906 B = factors->workVector; 1907 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1908 Y = x; 1909 } 1910 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1911 1912 // Solve L^T X = Y 1913 if (row_identity) { 1914 X = x; 1915 } else { 1916 X = factors->workVector; 1917 } 1918 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1919 1920 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1921 if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1922 1923 PetscCall(VecRestoreKokkosView(bb, &b)); 1924 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1925 PetscCall(PetscLogGpuTimeEnd()); 1926 PetscFunctionReturn(PETSC_SUCCESS); 1927 } 1928 1929 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1930 { 1931 PetscFunctionBegin; 1932 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1933 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1934 1935 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1936 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1937 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1938 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1939 const MatScalar *Ba = b->a; 1940 PetscInt m = B->rmap->n, n = B->cmap->n; 1941 1942 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1943 // Allocate memory and copy the structure 1944 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1945 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1946 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1947 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1948 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1949 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1950 1951 PetscInt *Li = factors->iL_h.data(); 1952 PetscInt *Lj = factors->jL_h.data(); 1953 PetscInt *Ui = factors->iU_h.data(); 1954 PetscInt *Uj = factors->jU_h.data(); 1955 1956 Li[0] = Ui[0] = 0; 1957 for (PetscInt i = 0; i < m; i++) { 1958 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1959 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1960 1961 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1962 Lj[Li[i] + llen] = i; // diagonal entry of L 1963 1964 Uj[Ui[i]] = i; // diagonal entry of U 1965 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1966 1967 Li[i + 1] = Li[i] + llen + 1; 1968 Ui[i + 1] = Ui[i] + ulen; 1969 } 1970 1971 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1972 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1973 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1974 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1975 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1976 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1977 1978 // Copy row/col permutation to device 1979 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1980 PetscBool row_identity; 1981 PetscCall(ISIdentity(rowperm, &row_identity)); 1982 if (!row_identity) { 1983 const PetscInt *ip; 1984 1985 PetscCall(ISGetIndices(rowperm, &ip)); 1986 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1987 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1988 PetscCall(ISRestoreIndices(rowperm, &ip)); 1989 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1990 } 1991 1992 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1993 PetscBool col_identity; 1994 PetscCall(ISIdentity(colperm, &col_identity)); 1995 if (!col_identity) { 1996 const PetscInt *ip; 1997 1998 PetscCall(ISGetIndices(colperm, &ip)); 1999 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2000 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2001 PetscCall(ISRestoreIndices(colperm, &ip)); 2002 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2003 } 2004 2005 /* Create sptrsv handles for L, U and their transpose */ 2006 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2007 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2008 #else 2009 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2010 #endif 2011 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2012 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2013 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2014 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2015 } 2016 2017 // Copy the value 2018 for (PetscInt i = 0; i < m; i++) { 2019 PetscInt llen = Bi[i + 1] - Bi[i]; 2020 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2021 const PetscInt *Li = factors->iL_h.data(); 2022 const PetscInt *Ui = factors->iU_h.data(); 2023 2024 PetscScalar *La = factors->aL_h.data(); 2025 PetscScalar *Ua = factors->aU_h.data(); 2026 2027 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2028 La[Li[i] + llen] = 1.0; // diagonal entry 2029 2030 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2031 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2032 } 2033 2034 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2035 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2036 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2037 factors->transpose_updated = PETSC_FALSE; 2038 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2039 2040 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2041 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2042 } 2043 2044 B->ops->matsolve = NULL; 2045 B->ops->matsolvetranspose = NULL; 2046 PetscFunctionReturn(PETSC_SUCCESS); 2047 } 2048 2049 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2050 { 2051 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2052 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2053 PetscInt fill_lev = info->levels; 2054 2055 PetscFunctionBegin; 2056 PetscCall(PetscLogGpuTimeBegin()); 2057 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2058 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2059 2060 auto a_d = aijkok->a_dual.view_device(); 2061 auto i_d = aijkok->i_dual.view_device(); 2062 auto j_d = aijkok->j_dual.view_device(); 2063 2064 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2065 2066 B->assembled = PETSC_TRUE; 2067 B->preallocated = PETSC_TRUE; 2068 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2069 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2070 B->ops->matsolve = NULL; 2071 B->ops->matsolvetranspose = NULL; 2072 2073 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2074 factors->transpose_updated = PETSC_FALSE; 2075 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2076 /* TODO: log flops, but how to know that? */ 2077 PetscCall(PetscLogGpuTimeEnd()); 2078 PetscFunctionReturn(PETSC_SUCCESS); 2079 } 2080 2081 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2082 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2083 { 2084 Mat_SeqAIJKokkos *aijkok; 2085 Mat_SeqAIJ *b; 2086 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2087 PetscInt fill_lev = info->levels; 2088 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2089 PetscInt n = A->rmap->n; 2090 2091 PetscFunctionBegin; 2092 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2093 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2094 2095 /* Create a spiluk handle and then do symbolic factorization */ 2096 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2097 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2098 2099 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2100 2101 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2102 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2103 Kokkos::realloc(factors->iU_d, n + 1); 2104 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2105 2106 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2107 auto i_d = aijkok->i_dual.view_device(); 2108 auto j_d = aijkok->j_dual.view_device(); 2109 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2110 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2111 2112 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2113 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2114 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2115 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2116 2117 /* TODO: add options to select sptrsv algorithms */ 2118 /* Create sptrsv handles for L, U and their transpose */ 2119 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2120 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2121 #else 2122 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2123 #endif 2124 2125 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2126 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2127 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2128 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2129 2130 /* Fill fields of the factor matrix B */ 2131 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2132 b = (Mat_SeqAIJ *)B->data; 2133 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2134 B->info.fill_ratio_given = info->fill; 2135 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2136 2137 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2138 PetscFunctionReturn(PETSC_SUCCESS); 2139 } 2140 2141 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2142 { 2143 PetscFunctionBegin; 2144 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2145 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2146 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2147 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2148 PetscFunctionReturn(PETSC_SUCCESS); 2149 } 2150 2151 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2152 { 2153 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2154 2155 PetscFunctionBegin; 2156 if (!info->factoronhost) { 2157 PetscCall(ISIdentity(isrow, &row_identity)); 2158 PetscCall(ISIdentity(iscol, &col_identity)); 2159 } 2160 2161 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2162 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2163 2164 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2165 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2166 } else { 2167 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2168 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2169 } 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2174 { 2175 PetscFunctionBegin; 2176 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2177 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2178 2179 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2180 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2181 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2182 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2183 const MatScalar *Ba = b->a; 2184 PetscInt m = B->rmap->n; 2185 2186 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2187 // Allocate memory and copy the structure 2188 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2189 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2190 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2191 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2192 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2193 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2194 2195 // Build jU_h from the skewed Aj 2196 PetscInt *Uj = factors->jU_h.data(); 2197 for (PetscInt i = 0; i < m; i++) { 2198 PetscInt ulen = Bi[i + 1] - Bi[i]; 2199 Uj[Bi[i]] = i; // diagonal entry 2200 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2201 } 2202 2203 // Copy iU, jU to device 2204 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2205 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2206 2207 // Copy row/col permutation to device 2208 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2209 PetscBool row_identity; 2210 PetscCall(ISIdentity(rowperm, &row_identity)); 2211 if (!row_identity) { 2212 const PetscInt *ip; 2213 2214 PetscCall(ISGetIndices(rowperm, &ip)); 2215 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2216 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2217 PetscCall(ISRestoreIndices(rowperm, &ip)); 2218 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2219 } 2220 2221 // Create sptrsv handles for U and U^T 2222 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2223 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2224 #else 2225 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2226 #endif 2227 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2228 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2229 } 2230 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2231 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2232 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2233 2234 // Copy the value 2235 PetscScalar *Ua = factors->aU_h.data(); 2236 PetscScalar *D = factors->D_h.data(); 2237 for (PetscInt i = 0; i < m; i++) { 2238 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2239 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2240 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2241 } 2242 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2243 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2244 2245 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2246 factors->transpose_updated = PETSC_FALSE; 2247 } 2248 2249 B->ops->matsolve = NULL; 2250 B->ops->matsolvetranspose = NULL; 2251 PetscFunctionReturn(PETSC_SUCCESS); 2252 } 2253 2254 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2255 { 2256 PetscFunctionBegin; 2257 if (info->solveonhost) { 2258 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2259 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2260 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2261 } 2262 2263 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2264 2265 if (!info->solveonhost) { 2266 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2267 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2268 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2269 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2270 } 2271 PetscFunctionReturn(PETSC_SUCCESS); 2272 } 2273 2274 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2275 { 2276 PetscFunctionBegin; 2277 if (info->solveonhost) { 2278 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2279 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2280 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2281 } 2282 2283 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2284 2285 if (!info->solveonhost) { 2286 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2287 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2288 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2289 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2290 } 2291 PetscFunctionReturn(PETSC_SUCCESS); 2292 } 2293 2294 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2295 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2296 { 2297 PetscFunctionBegin; 2298 *type = MATSOLVERKOKKOS; 2299 PetscFunctionReturn(PETSC_SUCCESS); 2300 } 2301 2302 /*MC 2303 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2304 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2305 2306 Level: beginner 2307 2308 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2309 M*/ 2310 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2311 { 2312 PetscInt n = A->rmap->n; 2313 MPI_Comm comm; 2314 2315 PetscFunctionBegin; 2316 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2317 PetscCall(MatCreate(comm, B)); 2318 PetscCall(MatSetSizes(*B, n, n, n, n)); 2319 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2320 (*B)->factortype = ftype; 2321 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2322 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2323 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2324 2325 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2326 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2327 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2328 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2329 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2330 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2331 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2332 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2333 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2334 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2335 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2336 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2337 2338 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2339 (*B)->canuseordering = PETSC_TRUE; 2340 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2341 PetscFunctionReturn(PETSC_SUCCESS); 2342 } 2343 2344 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2345 { 2346 PetscFunctionBegin; 2347 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2350 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2351 PetscFunctionReturn(PETSC_SUCCESS); 2352 } 2353 2354 /* Utility to print out a KokkosCsrMatrix for debugging */ 2355 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2356 { 2357 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2358 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2359 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2360 const PetscInt *i = iv.data(); 2361 const PetscInt *j = jv.data(); 2362 const PetscScalar *a = av.data(); 2363 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2364 2365 PetscFunctionBegin; 2366 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2367 for (PetscInt k = 0; k < m; k++) { 2368 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2369 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2370 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2371 } 2372 PetscFunctionReturn(PETSC_SUCCESS); 2373 } 2374