1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165 *array = aijkok->a_dual.view_host().data(); 166 } else { /* Happens when calling MatSetValues on a newly created matrix */ 167 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168 } 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171 172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175 176 PetscFunctionBegin; 177 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188 *array = aijkok->a_dual.view_host().data(); 189 } else { 190 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 191 } 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196 { 197 PetscFunctionBegin; 198 *array = NULL; 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205 206 PetscFunctionBegin; 207 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208 *array = aijkok->a_dual.view_host().data(); 209 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 210 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 211 } 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218 219 PetscFunctionBegin; 220 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221 aijkok->a_dual.clear_sync_state(); 222 aijkok->a_dual.modify_host(); 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228 { 229 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 230 231 PetscFunctionBegin; 232 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 233 234 if (i) *i = aijkok->i_device_data(); 235 if (j) *j = aijkok->j_device_data(); 236 if (a) { 237 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 238 *a = aijkok->a_device_data(); 239 } 240 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 245 { 246 PetscFunctionBegin; 247 *mtype = PETSC_MEMTYPE_KOKKOS; 248 PetscFunctionReturn(PETSC_SUCCESS); 249 } 250 251 /* 252 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 253 254 Input Parameter: 255 . A - the MATSEQAIJKOKKOS matrix 256 257 Output Parameters: 258 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 259 - T_d - the transpose on device, whose value array is allocated but not initialized 260 */ 261 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 262 { 263 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 264 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 265 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 266 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 267 MatRowMapType *Ti = Ti_h.data(); 268 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 269 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 270 PetscInt *Tj = Tj_h.data(); 271 PetscInt *perm = perm_h.data(); 272 PetscInt *offset; 273 274 PetscFunctionBegin; 275 // Populate Ti 276 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 277 Ti++; 278 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 279 Ti--; 280 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 281 282 // Populate Tj and the permutation array 283 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 284 for (PetscInt i = 0; i < m; i++) { 285 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 286 PetscInt r = Aj[j]; // row r of T 287 PetscInt disp = Ti[r] + offset[r]; 288 289 Tj[disp] = i; // col i of T 290 perm[disp] = j; 291 offset[r]++; 292 } 293 } 294 PetscCall(PetscFree(offset)); 295 296 // Sort each row of T, along with the permutation array 297 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 298 299 // Output perm and T on device 300 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 301 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 302 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 303 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 304 PetscFunctionReturn(PETSC_SUCCESS); 305 } 306 307 // Generate the transpose on device and cache it internally 308 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 309 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 310 { 311 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 312 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 313 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 314 KokkosCsrMatrix &T = akok->csrmatT; 315 316 PetscFunctionBegin; 317 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 318 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 319 320 const auto &Aa = akok->a_dual.view_device(); 321 322 if (A->symmetric == PETSC_BOOL3_TRUE) { 323 *csrmatT = akok->csrmat; 324 } else { 325 // See if we already have a cached transpose and its value is up to date 326 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 327 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 328 const auto &perm = akok->transpose_perm; // get the permutation array 329 auto &Ta = T.values; 330 331 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 332 } 333 } else { // Generate T of size n x m for the first time 334 MatRowMapKokkosView perm; 335 336 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 337 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 338 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 339 } 340 akok->transpose_updated = PETSC_TRUE; 341 *csrmatT = akok->csrmatT; 342 } 343 PetscFunctionReturn(PETSC_SUCCESS); 344 } 345 346 // Generate the Hermitian on device and cache it internally 347 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 348 { 349 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 350 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 351 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 352 KokkosCsrMatrix &T = akok->csrmatH; 353 354 PetscFunctionBegin; 355 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 356 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 357 358 const auto &Aa = akok->a_dual.view_device(); 359 360 if (A->hermitian == PETSC_BOOL3_TRUE) { 361 *csrmatH = akok->csrmat; 362 } else { 363 // See if we already have a cached hermitian and its value is up to date 364 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 365 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 366 const auto &perm = akok->transpose_perm; // get the permutation array 367 auto &Ta = T.values; 368 369 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 370 } 371 } else { // Generate T of size n x m for the first time 372 MatRowMapKokkosView perm; 373 374 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 375 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 376 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 377 } 378 akok->hermitian_updated = PETSC_TRUE; 379 *csrmatH = akok->csrmatH; 380 } 381 PetscFunctionReturn(PETSC_SUCCESS); 382 } 383 384 /* y = A x */ 385 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 386 { 387 Mat_SeqAIJKokkos *aijkok; 388 ConstPetscScalarKokkosView xv; 389 PetscScalarKokkosView yv; 390 391 PetscFunctionBegin; 392 PetscCall(PetscLogGpuTimeBegin()); 393 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 394 PetscCall(VecGetKokkosView(xx, &xv)); 395 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 396 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 397 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 398 PetscCall(VecRestoreKokkosView(xx, &xv)); 399 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 400 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 401 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 402 PetscCall(PetscLogGpuTimeEnd()); 403 PetscFunctionReturn(PETSC_SUCCESS); 404 } 405 406 /* y = A^T x */ 407 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 408 { 409 Mat_SeqAIJKokkos *aijkok; 410 const char *mode; 411 ConstPetscScalarKokkosView xv; 412 PetscScalarKokkosView yv; 413 KokkosCsrMatrix csrmat; 414 415 PetscFunctionBegin; 416 PetscCall(PetscLogGpuTimeBegin()); 417 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 418 PetscCall(VecGetKokkosView(xx, &xv)); 419 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 420 if (A->form_explicit_transpose) { 421 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 422 mode = "N"; 423 } else { 424 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 425 csrmat = aijkok->csrmat; 426 mode = "T"; 427 } 428 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 429 PetscCall(VecRestoreKokkosView(xx, &xv)); 430 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 431 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 432 PetscCall(PetscLogGpuTimeEnd()); 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 /* y = A^H x */ 437 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 438 { 439 Mat_SeqAIJKokkos *aijkok; 440 const char *mode; 441 ConstPetscScalarKokkosView xv; 442 PetscScalarKokkosView yv; 443 KokkosCsrMatrix csrmat; 444 445 PetscFunctionBegin; 446 PetscCall(PetscLogGpuTimeBegin()); 447 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 448 PetscCall(VecGetKokkosView(xx, &xv)); 449 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 450 if (A->form_explicit_transpose) { 451 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 452 mode = "N"; 453 } else { 454 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 455 csrmat = aijkok->csrmat; 456 mode = "C"; 457 } 458 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 459 PetscCall(VecRestoreKokkosView(xx, &xv)); 460 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 461 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 462 PetscCall(PetscLogGpuTimeEnd()); 463 PetscFunctionReturn(PETSC_SUCCESS); 464 } 465 466 /* z = A x + y */ 467 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 468 { 469 Mat_SeqAIJKokkos *aijkok; 470 ConstPetscScalarKokkosView xv; 471 PetscScalarKokkosView zv; 472 473 PetscFunctionBegin; 474 PetscCall(PetscLogGpuTimeBegin()); 475 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 476 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 477 PetscCall(VecGetKokkosView(xx, &xv)); 478 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 479 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 480 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 481 PetscCall(VecRestoreKokkosView(xx, &xv)); 482 PetscCall(VecRestoreKokkosView(zz, &zv)); 483 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 484 PetscCall(PetscLogGpuTimeEnd()); 485 PetscFunctionReturn(PETSC_SUCCESS); 486 } 487 488 /* z = A^T x + y */ 489 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 490 { 491 Mat_SeqAIJKokkos *aijkok; 492 const char *mode; 493 ConstPetscScalarKokkosView xv; 494 PetscScalarKokkosView zv; 495 KokkosCsrMatrix csrmat; 496 497 PetscFunctionBegin; 498 PetscCall(PetscLogGpuTimeBegin()); 499 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 500 if (zz != yy) PetscCall(VecCopy(yy, zz)); 501 PetscCall(VecGetKokkosView(xx, &xv)); 502 PetscCall(VecGetKokkosView(zz, &zv)); 503 if (A->form_explicit_transpose) { 504 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 505 mode = "N"; 506 } else { 507 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508 csrmat = aijkok->csrmat; 509 mode = "T"; 510 } 511 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 512 PetscCall(VecRestoreKokkosView(xx, &xv)); 513 PetscCall(VecRestoreKokkosView(zz, &zv)); 514 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 515 PetscCall(PetscLogGpuTimeEnd()); 516 PetscFunctionReturn(PETSC_SUCCESS); 517 } 518 519 /* z = A^H x + y */ 520 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 521 { 522 Mat_SeqAIJKokkos *aijkok; 523 const char *mode; 524 ConstPetscScalarKokkosView xv; 525 PetscScalarKokkosView zv; 526 KokkosCsrMatrix csrmat; 527 528 PetscFunctionBegin; 529 PetscCall(PetscLogGpuTimeBegin()); 530 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 531 if (zz != yy) PetscCall(VecCopy(yy, zz)); 532 PetscCall(VecGetKokkosView(xx, &xv)); 533 PetscCall(VecGetKokkosView(zz, &zv)); 534 if (A->form_explicit_transpose) { 535 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 536 mode = "N"; 537 } else { 538 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 csrmat = aijkok->csrmat; 540 mode = "C"; 541 } 542 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 543 PetscCall(VecRestoreKokkosView(xx, &xv)); 544 PetscCall(VecRestoreKokkosView(zz, &zv)); 545 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 546 PetscCall(PetscLogGpuTimeEnd()); 547 PetscFunctionReturn(PETSC_SUCCESS); 548 } 549 550 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 551 { 552 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 553 554 PetscFunctionBegin; 555 switch (op) { 556 case MAT_FORM_EXPLICIT_TRANSPOSE: 557 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 558 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 559 A->form_explicit_transpose = flg; 560 break; 561 default: 562 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 563 break; 564 } 565 PetscFunctionReturn(PETSC_SUCCESS); 566 } 567 568 /* Depending on reuse, either build a new mat, or use the existing mat */ 569 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 570 { 571 Mat_SeqAIJ *aseq; 572 573 PetscFunctionBegin; 574 PetscCall(PetscKokkosInitializeCheck()); 575 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 576 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 577 PetscCall(MatSetType(*newmat, mtype)); 578 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 579 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 580 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 581 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 582 PetscCall(PetscFree(A->defaultvectype)); 583 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 584 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 585 PetscCall(MatSetOps_SeqAIJKokkos(A)); 586 aseq = static_cast<Mat_SeqAIJ *>(A->data); 587 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 588 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 589 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 590 } 591 } 592 PetscFunctionReturn(PETSC_SUCCESS); 593 } 594 595 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 596 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 597 */ 598 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 599 { 600 Mat_SeqAIJ *bseq; 601 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 602 Mat mat; 603 604 PetscFunctionBegin; 605 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 606 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 607 mat = *B; 608 if (A->assembled) { 609 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 610 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 611 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 612 /* Now copy values to B if needed */ 613 if (dupOption == MAT_COPY_VALUES) { 614 if (akok->a_dual.need_sync_device()) { 615 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 616 bkok->a_dual.modify_host(); 617 } else { /* If device has the latest data, we only copy data on device */ 618 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 619 bkok->a_dual.modify_device(); 620 } 621 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 622 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 623 bkok->a_dual.modify_host(); 624 } 625 mat->spptr = bkok; 626 } 627 628 PetscCall(PetscFree(mat->defaultvectype)); 629 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 630 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 631 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 632 PetscFunctionReturn(PETSC_SUCCESS); 633 } 634 635 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 636 { 637 Mat At; 638 KokkosCsrMatrix internT; 639 Mat_SeqAIJKokkos *atkok, *bkok; 640 641 PetscFunctionBegin; 642 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 643 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 644 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 645 /* Deep copy internT, as we want to isolate the internal transpose */ 646 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 647 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 648 if (reuse == MAT_INITIAL_MATRIX) *B = At; 649 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 650 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 651 if ((*B)->assembled) { 652 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 653 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 654 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 655 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 656 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 657 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 658 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 659 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 660 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 661 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 662 } 663 PetscFunctionReturn(PETSC_SUCCESS); 664 } 665 666 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 667 { 668 Mat_SeqAIJKokkos *aijkok; 669 670 PetscFunctionBegin; 671 if (A->factortype == MAT_FACTOR_NONE) { 672 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 673 delete aijkok; 674 } else { 675 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 676 } 677 A->spptr = NULL; 678 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 681 #if defined(PETSC_HAVE_HYPRE) 682 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 683 #endif 684 PetscCall(MatDestroy_SeqAIJ(A)); 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 /*MC 689 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 690 691 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 692 693 Options Database Key: 694 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 695 696 Level: beginner 697 698 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 699 M*/ 700 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 701 { 702 PetscFunctionBegin; 703 PetscCall(PetscKokkosInitializeCheck()); 704 PetscCall(MatCreate_SeqAIJ(A)); 705 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 706 PetscFunctionReturn(PETSC_SUCCESS); 707 } 708 709 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 710 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 711 { 712 Mat_SeqAIJ *a, *b; 713 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 714 MatScalarKokkosView aa, ba, ca; 715 MatRowMapKokkosView ai, bi, ci; 716 MatColIdxKokkosView aj, bj, cj; 717 PetscInt m, n, nnz, aN; 718 719 PetscFunctionBegin; 720 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 721 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 722 PetscAssertPointer(C, 4); 723 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 724 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 725 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 726 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 727 728 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 729 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 730 a = static_cast<Mat_SeqAIJ *>(A->data); 731 b = static_cast<Mat_SeqAIJ *>(B->data); 732 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 733 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 734 aa = akok->a_dual.view_device(); 735 ai = akok->i_dual.view_device(); 736 ba = bkok->a_dual.view_device(); 737 bi = bkok->i_dual.view_device(); 738 m = A->rmap->n; /* M, N and nnz of C */ 739 n = A->cmap->n + B->cmap->n; 740 nnz = a->nz + b->nz; 741 aN = A->cmap->n; /* N of A */ 742 if (reuse == MAT_INITIAL_MATRIX) { 743 aj = akok->j_dual.view_device(); 744 bj = bkok->j_dual.view_device(); 745 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 746 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 747 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 748 ca = ca_dual.view_device(); 749 ci = ci_dual.view_device(); 750 cj = cj_dual.view_device(); 751 752 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 753 Kokkos::parallel_for( 754 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 755 PetscInt i = t.league_rank(); /* row i */ 756 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 757 758 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 759 ci(i) = coffset; 760 if (i == m - 1) ci(m) = ai(m) + bi(m); 761 }); 762 763 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 764 if (k < alen) { 765 ca(coffset + k) = aa(ai(i) + k); 766 cj(coffset + k) = aj(ai(i) + k); 767 } else { 768 ca(coffset + k) = ba(bi(i) + k - alen); 769 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 770 } 771 }); 772 }); 773 ca_dual.modify_device(); 774 ci_dual.modify_device(); 775 cj_dual.modify_device(); 776 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 777 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 778 } else if (reuse == MAT_REUSE_MATRIX) { 779 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 780 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 781 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 782 ca = ckok->a_dual.view_device(); 783 ci = ckok->i_dual.view_device(); 784 785 Kokkos::parallel_for( 786 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 787 PetscInt i = t.league_rank(); /* row i */ 788 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 789 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 790 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 791 else ca(ci(i) + k) = ba(bi(i) + k - alen); 792 }); 793 }); 794 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 795 } 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 800 { 801 PetscFunctionBegin; 802 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 803 PetscFunctionReturn(PETSC_SUCCESS); 804 } 805 806 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 807 { 808 Mat_Product *product = C->product; 809 Mat A, B; 810 bool transA, transB; /* use bool, since KK needs this type */ 811 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 812 Mat_SeqAIJ *c; 813 MatProductData_SeqAIJKokkos *pdata; 814 KokkosCsrMatrix csrmatA, csrmatB; 815 816 PetscFunctionBegin; 817 MatCheckProduct(C, 1); 818 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 819 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 820 821 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 822 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 823 // we still do numeric. 824 if (pdata->reusesym) { // numeric reuses results from symbolic 825 pdata->reusesym = PETSC_FALSE; 826 PetscFunctionReturn(PETSC_SUCCESS); 827 } 828 829 switch (product->type) { 830 case MATPRODUCT_AB: 831 transA = false; 832 transB = false; 833 break; 834 case MATPRODUCT_AtB: 835 transA = true; 836 transB = false; 837 break; 838 case MATPRODUCT_ABt: 839 transA = false; 840 transB = true; 841 break; 842 default: 843 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 844 } 845 846 A = product->A; 847 B = product->B; 848 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 849 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 850 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 851 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 852 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 853 854 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 855 856 csrmatA = akok->csrmat; 857 csrmatB = bkok->csrmat; 858 859 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 860 if (transA) { 861 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 862 transA = false; 863 } 864 865 if (transB) { 866 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 867 transB = false; 868 } 869 PetscCall(PetscLogGpuTimeBegin()); 870 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 871 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 872 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 873 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 874 #endif 875 876 PetscCall(PetscLogGpuTimeEnd()); 877 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 878 /* shorter version of MatAssemblyEnd_SeqAIJ */ 879 c = (Mat_SeqAIJ *)C->data; 880 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 881 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 882 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 883 c->reallocs = 0; 884 C->info.mallocs = 0; 885 C->info.nz_unneeded = 0; 886 C->assembled = C->was_assembled = PETSC_TRUE; 887 C->num_ass++; 888 PetscFunctionReturn(PETSC_SUCCESS); 889 } 890 891 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 892 { 893 Mat_Product *product = C->product; 894 MatProductType ptype; 895 Mat A, B; 896 bool transA, transB; 897 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 898 MatProductData_SeqAIJKokkos *pdata; 899 MPI_Comm comm; 900 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 901 902 PetscFunctionBegin; 903 MatCheckProduct(C, 1); 904 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 905 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 906 A = product->A; 907 B = product->B; 908 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 909 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 910 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 911 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 912 csrmatA = akok->csrmat; 913 csrmatB = bkok->csrmat; 914 915 ptype = product->type; 916 // Take advantage of the symmetry if true 917 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 918 ptype = MATPRODUCT_AB; 919 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 920 } 921 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 922 ptype = MATPRODUCT_AB; 923 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 924 } 925 926 switch (ptype) { 927 case MATPRODUCT_AB: 928 transA = false; 929 transB = false; 930 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 931 break; 932 case MATPRODUCT_AtB: 933 transA = true; 934 transB = false; 935 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 936 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 937 break; 938 case MATPRODUCT_ABt: 939 transA = false; 940 transB = true; 941 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 942 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 943 break; 944 default: 945 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 946 } 947 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 948 pdata->reusesym = product->api_user; 949 950 /* TODO: add command line options to select spgemm algorithms */ 951 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 952 953 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 954 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 955 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 956 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 957 #endif 958 #endif 959 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 960 961 PetscCall(PetscLogGpuTimeBegin()); 962 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 963 if (transA) { 964 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 965 transA = false; 966 } 967 968 if (transB) { 969 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 970 transB = false; 971 } 972 973 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 974 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 975 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 976 calling new Mat_SeqAIJKokkos(). 977 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 978 */ 979 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 980 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 981 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 982 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 983 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 984 #endif 985 PetscCall(PetscLogGpuTimeEnd()); 986 987 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 988 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 989 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 990 PetscFunctionReturn(PETSC_SUCCESS); 991 } 992 993 /* handles sparse matrix matrix ops */ 994 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 995 { 996 Mat_Product *product = mat->product; 997 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 998 999 PetscFunctionBegin; 1000 MatCheckProduct(mat, 1); 1001 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 1002 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 1003 if (Biskok && Ciskok) { 1004 switch (product->type) { 1005 case MATPRODUCT_AB: 1006 case MATPRODUCT_AtB: 1007 case MATPRODUCT_ABt: 1008 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1009 break; 1010 case MATPRODUCT_PtAP: 1011 case MATPRODUCT_RARt: 1012 case MATPRODUCT_ABC: 1013 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1014 break; 1015 default: 1016 break; 1017 } 1018 } else { /* fallback for AIJ */ 1019 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1020 } 1021 PetscFunctionReturn(PETSC_SUCCESS); 1022 } 1023 1024 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1025 { 1026 Mat_SeqAIJKokkos *aijkok; 1027 1028 PetscFunctionBegin; 1029 PetscCall(PetscLogGpuTimeBegin()); 1030 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1031 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1032 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1033 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1034 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1035 PetscCall(PetscLogGpuTimeEnd()); 1036 PetscFunctionReturn(PETSC_SUCCESS); 1037 } 1038 1039 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1040 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1041 { 1042 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1043 1044 PetscFunctionBegin; 1045 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1046 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1047 1048 PetscCall(PetscLogGpuTimeBegin()); 1049 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1050 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1051 const auto &Aa = aijkok->a_dual.view_device(); 1052 const auto &Adiag = aijkok->diag_dual.view_device(); 1053 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1054 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1055 PetscCall(PetscLogGpuFlops(n)); 1056 PetscCall(PetscLogGpuTimeEnd()); 1057 } else { // need reassembly, very slow! 1058 PetscCall(MatShift_Basic(A, a)); 1059 } 1060 PetscFunctionReturn(PETSC_SUCCESS); 1061 } 1062 1063 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1064 { 1065 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1066 1067 PetscFunctionBegin; 1068 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1069 ConstPetscScalarKokkosView dv; 1070 PetscInt n, nv; 1071 1072 PetscCall(PetscLogGpuTimeBegin()); 1073 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1074 PetscCall(VecGetKokkosView(D, &dv)); 1075 PetscCall(VecGetLocalSize(D, &nv)); 1076 n = PetscMin(Y->rmap->n, Y->cmap->n); 1077 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1078 1079 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1080 const auto &Aa = aijkok->a_dual.view_device(); 1081 const auto &Adiag = aijkok->diag_dual.view_device(); 1082 PetscCallCXX(Kokkos::parallel_for( 1083 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1084 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1085 else Aa(Adiag(i)) += dv(i); 1086 })); 1087 PetscCall(VecRestoreKokkosView(D, &dv)); 1088 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1089 PetscCall(PetscLogGpuFlops(n)); 1090 PetscCall(PetscLogGpuTimeEnd()); 1091 } else { // need reassembly, very slow! 1092 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1093 } 1094 PetscFunctionReturn(PETSC_SUCCESS); 1095 } 1096 1097 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1098 { 1099 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1100 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1101 ConstPetscScalarKokkosView lv, rv; 1102 1103 PetscFunctionBegin; 1104 PetscCall(PetscLogGpuTimeBegin()); 1105 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1106 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1107 const auto &Aa = aijkok->a_dual.view_device(); 1108 const auto &Ai = aijkok->i_dual.view_device(); 1109 const auto &Aj = aijkok->j_dual.view_device(); 1110 if (ll) { 1111 PetscCall(VecGetLocalSize(ll, &m)); 1112 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1113 PetscCall(VecGetKokkosView(ll, &lv)); 1114 PetscCallCXX(Kokkos::parallel_for( // for each row 1115 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1116 PetscInt i = t.league_rank(); // row i 1117 PetscInt len = Ai(i + 1) - Ai(i); 1118 // scale entries on the row 1119 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1120 })); 1121 PetscCall(VecRestoreKokkosView(ll, &lv)); 1122 PetscCall(PetscLogGpuFlops(nz)); 1123 } 1124 if (rr) { 1125 PetscCall(VecGetLocalSize(rr, &n)); 1126 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1127 PetscCall(VecGetKokkosView(rr, &rv)); 1128 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1129 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1130 PetscCall(VecRestoreKokkosView(rr, &lv)); 1131 PetscCall(PetscLogGpuFlops(nz)); 1132 } 1133 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1134 PetscCall(PetscLogGpuTimeEnd()); 1135 PetscFunctionReturn(PETSC_SUCCESS); 1136 } 1137 1138 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1139 { 1140 Mat_SeqAIJKokkos *aijkok; 1141 1142 PetscFunctionBegin; 1143 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1144 if (aijkok) { /* Only zero the device if data is already there */ 1145 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1146 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1147 } else { /* Might be preallocated but not assembled */ 1148 PetscCall(MatZeroEntries_SeqAIJ(A)); 1149 } 1150 PetscFunctionReturn(PETSC_SUCCESS); 1151 } 1152 1153 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1154 { 1155 Mat_SeqAIJKokkos *aijkok; 1156 PetscInt n; 1157 PetscScalarKokkosView xv; 1158 1159 PetscFunctionBegin; 1160 PetscCall(VecGetLocalSize(x, &n)); 1161 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1162 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1163 1164 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1165 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1166 1167 const auto &Aa = aijkok->a_dual.view_device(); 1168 const auto &Ai = aijkok->i_dual.view_device(); 1169 const auto &Adiag = aijkok->diag_dual.view_device(); 1170 1171 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1172 Kokkos::parallel_for( 1173 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1174 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1175 else xv(i) = 0; 1176 }); 1177 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1178 PetscFunctionReturn(PETSC_SUCCESS); 1179 } 1180 1181 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1182 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1183 { 1184 Mat_SeqAIJKokkos *aijkok; 1185 1186 PetscFunctionBegin; 1187 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1188 PetscAssertPointer(kv, 2); 1189 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1190 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1191 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1192 *kv = aijkok->a_dual.view_device(); 1193 PetscFunctionReturn(PETSC_SUCCESS); 1194 } 1195 1196 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1197 { 1198 PetscFunctionBegin; 1199 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1200 PetscAssertPointer(kv, 2); 1201 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202 PetscFunctionReturn(PETSC_SUCCESS); 1203 } 1204 1205 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1206 { 1207 Mat_SeqAIJKokkos *aijkok; 1208 1209 PetscFunctionBegin; 1210 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1211 PetscAssertPointer(kv, 2); 1212 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1213 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1214 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1215 *kv = aijkok->a_dual.view_device(); 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1220 { 1221 PetscFunctionBegin; 1222 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1223 PetscAssertPointer(kv, 2); 1224 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1225 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1226 PetscFunctionReturn(PETSC_SUCCESS); 1227 } 1228 1229 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1230 { 1231 Mat_SeqAIJKokkos *aijkok; 1232 1233 PetscFunctionBegin; 1234 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1235 PetscAssertPointer(kv, 2); 1236 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1237 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1238 *kv = aijkok->a_dual.view_device(); 1239 PetscFunctionReturn(PETSC_SUCCESS); 1240 } 1241 1242 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1243 { 1244 PetscFunctionBegin; 1245 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1246 PetscAssertPointer(kv, 2); 1247 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1248 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1249 PetscFunctionReturn(PETSC_SUCCESS); 1250 } 1251 1252 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1253 { 1254 Mat_SeqAIJKokkos *akok; 1255 1256 PetscFunctionBegin; 1257 // Create host copies of the input aij 1258 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1259 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1260 // Don't copy the vals to the host now 1261 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1262 1263 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1264 // Note we have modified device data so it will copy lazily 1265 a_dual.modify_device(); 1266 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1267 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1268 1269 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1270 PetscCall(MatCreate(comm, A)); 1271 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1272 PetscFunctionReturn(PETSC_SUCCESS); 1273 } 1274 1275 /* Computes Y += alpha X */ 1276 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1277 { 1278 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1279 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1280 ConstMatScalarKokkosView Xa; 1281 MatScalarKokkosView Ya; 1282 auto exec = PetscGetKokkosExecutionSpace(); 1283 1284 PetscFunctionBegin; 1285 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1286 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1287 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1288 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1289 PetscCall(PetscLogGpuTimeBegin()); 1290 1291 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1292 PetscBool e; 1293 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1294 if (e) { 1295 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1296 if (e) pattern = SAME_NONZERO_PATTERN; 1297 } 1298 } 1299 1300 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1301 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1302 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1303 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1304 */ 1305 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1306 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1307 Xa = xkok->a_dual.view_device(); 1308 Ya = ykok->a_dual.view_device(); 1309 1310 if (pattern == SAME_NONZERO_PATTERN) { 1311 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1312 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1313 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1314 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1315 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1316 1317 Kokkos::parallel_for( 1318 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1319 PetscInt i = t.league_rank(); // row i 1320 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1321 // Only one thread works in a team 1322 PetscInt p, q = Yi(i); 1323 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1324 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1325 if (Xj(p) == Yj(q)) { // Found it 1326 Ya(q) += alpha * Xa(p); 1327 q++; 1328 } else { 1329 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1330 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1331 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1332 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1333 #else 1334 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1335 #endif 1336 } 1337 } 1338 }); 1339 }); 1340 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1341 } else { // different nonzero patterns 1342 Mat Z; 1343 KokkosCsrMatrix zcsr; 1344 KernelHandle kh; 1345 kh.create_spadd_handle(true); // X, Y are sorted 1346 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1347 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1348 zkok = new Mat_SeqAIJKokkos(zcsr); 1349 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1350 PetscCall(MatHeaderReplace(Y, &Z)); 1351 kh.destroy_spadd_handle(); 1352 } 1353 PetscCall(PetscLogGpuTimeEnd()); 1354 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1355 PetscFunctionReturn(PETSC_SUCCESS); 1356 } 1357 1358 struct MatCOOStruct_SeqAIJKokkos { 1359 PetscCount n; 1360 PetscCount Atot; 1361 PetscInt nz; 1362 PetscCountKokkosView jmap; 1363 PetscCountKokkosView perm; 1364 1365 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1366 { 1367 nz = coo_h->nz; 1368 n = coo_h->n; 1369 Atot = coo_h->Atot; 1370 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1371 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1372 } 1373 }; 1374 1375 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1376 { 1377 PetscFunctionBegin; 1378 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1379 PetscFunctionReturn(PETSC_SUCCESS); 1380 } 1381 1382 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1383 { 1384 Mat_SeqAIJKokkos *akok; 1385 Mat_SeqAIJ *aseq; 1386 PetscContainer container_h; 1387 MatCOOStruct_SeqAIJ *coo_h; 1388 MatCOOStruct_SeqAIJKokkos *coo_d; 1389 1390 PetscFunctionBegin; 1391 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1392 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1393 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1394 delete akok; 1395 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1396 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1397 1398 // Copy the COO struct to device 1399 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1400 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1401 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1402 1403 // Put the COO struct in a container and then attach that to the matrix 1404 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1405 PetscFunctionReturn(PETSC_SUCCESS); 1406 } 1407 1408 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1409 { 1410 MatScalarKokkosView Aa; 1411 ConstMatScalarKokkosView kv; 1412 PetscMemType memtype; 1413 PetscContainer container; 1414 MatCOOStruct_SeqAIJKokkos *coo; 1415 1416 PetscFunctionBegin; 1417 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1418 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1419 1420 const auto &n = coo->n; 1421 const auto &Annz = coo->nz; 1422 const auto &jmap = coo->jmap; 1423 const auto &perm = coo->perm; 1424 1425 PetscCall(PetscGetMemType(v, &memtype)); 1426 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1427 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1428 } else { 1429 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1430 } 1431 1432 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1433 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1434 1435 PetscCall(PetscLogGpuTimeBegin()); 1436 Kokkos::parallel_for( 1437 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1438 PetscScalar sum = 0.0; 1439 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1440 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1441 }); 1442 PetscCall(PetscLogGpuTimeEnd()); 1443 1444 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1445 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1446 PetscFunctionReturn(PETSC_SUCCESS); 1447 } 1448 1449 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1450 { 1451 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1452 1453 PetscFunctionBegin; 1454 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1455 A->boundtocpu = PETSC_FALSE; 1456 1457 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1458 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1459 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1460 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1461 A->ops->scale = MatScale_SeqAIJKokkos; 1462 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1463 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1464 A->ops->mult = MatMult_SeqAIJKokkos; 1465 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1466 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1467 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1468 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1469 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1470 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1471 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1472 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1473 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1474 A->ops->shift = MatShift_SeqAIJKokkos; 1475 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1476 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1477 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1478 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1479 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1480 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1481 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1482 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1483 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1484 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1485 1486 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1487 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1488 #if defined(PETSC_HAVE_HYPRE) 1489 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1490 #endif 1491 PetscFunctionReturn(PETSC_SUCCESS); 1492 } 1493 1494 /* 1495 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1496 1497 Input Parameters: 1498 + A - the MATSEQAIJKOKKOS matrix 1499 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1500 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1501 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1502 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1503 1504 Output Parameter: 1505 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1506 */ 1507 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1508 { 1509 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1510 PetscInt nblocks = bs.extent(0) - 1; 1511 1512 PetscFunctionBegin; 1513 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1514 1515 // Pull out the diagonal blocks of the matrix and then invert the blocks 1516 auto Aa = akok->a_dual.view_device(); 1517 auto Ai = akok->i_dual.view_device(); 1518 auto Aj = akok->j_dual.view_device(); 1519 auto Adiag = akok->diag_dual.view_device(); 1520 // TODO: how to tune the team size? 1521 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1522 auto ts = Kokkos::AUTO(); 1523 #else 1524 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1525 #endif 1526 PetscCallCXX(Kokkos::parallel_for( 1527 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1528 const PetscInt bid = teamMember.league_rank(); // block id 1529 const PetscInt rstart = bs(bid); // this block starts from this row 1530 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1531 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1532 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1533 1534 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1535 PetscInt i = rstart + r; // i-th row in A 1536 1537 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1538 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1539 1540 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1541 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1542 B(r, c) = 0.0; 1543 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1544 B(r, c) = Aa(first + c); 1545 } else { // this entry does not show up in the CSR 1546 B(r, c) = 0.0; 1547 } 1548 } 1549 } else { // rare case that the diagonal does not exist 1550 const PetscInt begin = Ai(i); 1551 const PetscInt end = Ai(i + 1); 1552 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1553 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1554 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1555 else if (Aj(j) >= rstart + m) break; 1556 } 1557 } 1558 }); 1559 1560 // LU-decompose B (w/o pivoting) and then invert B 1561 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1562 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1563 })); 1564 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1565 PetscFunctionReturn(PETSC_SUCCESS); 1566 } 1567 1568 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1569 { 1570 Mat_SeqAIJ *aseq; 1571 PetscInt i, m, n; 1572 auto exec = PetscGetKokkosExecutionSpace(); 1573 1574 PetscFunctionBegin; 1575 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1576 1577 m = akok->nrows(); 1578 n = akok->ncols(); 1579 PetscCall(MatSetSizes(A, m, n, m, n)); 1580 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1581 1582 /* Set up data structures of A as a MATSEQAIJ */ 1583 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1584 aseq = (Mat_SeqAIJ *)A->data; 1585 1586 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1587 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1588 1589 aseq->i = akok->i_host_data(); 1590 aseq->j = akok->j_host_data(); 1591 aseq->a = akok->a_host_data(); 1592 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1593 aseq->free_a = PETSC_FALSE; 1594 aseq->free_ij = PETSC_FALSE; 1595 aseq->nz = akok->nnz(); 1596 aseq->maxnz = aseq->nz; 1597 1598 PetscCall(PetscMalloc1(m, &aseq->imax)); 1599 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1600 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1601 1602 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1603 akok->nonzerostate = A->nonzerostate; 1604 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1605 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1606 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1611 { 1612 PetscFunctionBegin; 1613 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1614 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1615 PetscFunctionReturn(PETSC_SUCCESS); 1616 } 1617 1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1619 { 1620 Mat_SeqAIJKokkos *akok; 1621 1622 PetscFunctionBegin; 1623 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1624 PetscCall(MatCreate(comm, A)); 1625 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1626 PetscFunctionReturn(PETSC_SUCCESS); 1627 } 1628 1629 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1630 1631 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1632 */ 1633 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1634 { 1635 PetscFunctionBegin; 1636 PetscCall(MatCreate(comm, A)); 1637 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 /*@C 1642 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1643 (the default parallel PETSc format). This matrix will ultimately be handled by 1644 Kokkos for calculations. 1645 1646 Collective 1647 1648 Input Parameters: 1649 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1650 . m - number of rows 1651 . n - number of columns 1652 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1653 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1654 1655 Output Parameter: 1656 . A - the matrix 1657 1658 Level: intermediate 1659 1660 Notes: 1661 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1662 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1663 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1664 1665 The AIJ format, also called 1666 compressed row storage, is fully compatible with standard Fortran 1667 storage. That is, the stored row and column indices can begin at 1668 either one (as in Fortran) or zero. 1669 1670 Specify the preallocated storage with either `nz` or `nnz` (not both). 1671 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1672 allocation. 1673 1674 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1675 @*/ 1676 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1677 { 1678 PetscFunctionBegin; 1679 PetscCall(PetscKokkosInitializeCheck()); 1680 PetscCall(MatCreate(comm, A)); 1681 PetscCall(MatSetSizes(*A, m, n, m, n)); 1682 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1683 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1684 PetscFunctionReturn(PETSC_SUCCESS); 1685 } 1686 1687 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1688 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1689 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1690 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1691 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1692 { 1693 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1694 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1695 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1696 1697 PetscFunctionBegin; 1698 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1699 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1700 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1701 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1702 } 1703 PetscFunctionReturn(PETSC_SUCCESS); 1704 } 1705 1706 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1707 { 1708 const PetscInt n = A->rmap->n; 1709 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1710 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1711 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1712 1713 PetscFunctionBegin; 1714 if (!factors->transpose_updated) { 1715 if (has_upper) { 1716 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1717 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1718 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1719 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1720 } 1721 1722 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1723 if (!factors->U) { 1724 Mat_SeqAIJ *seq; 1725 1726 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1727 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1728 1729 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1730 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1731 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1732 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1733 } else { 1734 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1735 } 1736 // Copy Ut from host to device 1737 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1738 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1739 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1740 } else { // If U was computed on device, we also compute the transpose there 1741 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1742 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1743 factors->jU_d, factors->aU_d, 1744 factors->iUt_d, factors->jUt_d, 1745 factors->aUt_d)); 1746 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1747 } 1748 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1749 } 1750 1751 // do the same for L with LU 1752 if (has_lower) { 1753 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1754 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1755 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1756 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1757 } 1758 1759 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1760 if (!factors->L) { 1761 Mat_SeqAIJ *seq; 1762 1763 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1764 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1765 1766 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1767 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1768 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1769 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1770 } else { 1771 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1772 } 1773 // Copy Lt from host to device 1774 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1775 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1776 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1777 } else { // If L was computed on device, we also compute the transpose there 1778 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1779 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1780 factors->jL_d, factors->aL_d, 1781 factors->iLt_d, factors->jLt_d, 1782 factors->aLt_d)); 1783 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1784 } 1785 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1786 } 1787 1788 factors->transpose_updated = PETSC_TRUE; 1789 } 1790 PetscFunctionReturn(PETSC_SUCCESS); 1791 } 1792 1793 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1794 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1795 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1796 { 1797 auto exec = PetscGetKokkosExecutionSpace(); 1798 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1799 PetscInt m = A->rmap->n; 1800 PetscScalarKokkosView D = factors->D_d; 1801 PetscScalarKokkosView X, Y, B; // alias 1802 ConstPetscScalarKokkosView b; 1803 PetscScalarKokkosView x; 1804 PetscIntKokkosView &rowperm = factors->rowperm; 1805 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1806 1807 PetscFunctionBegin; 1808 PetscCall(PetscLogGpuTimeBegin()); 1809 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1810 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1811 PetscCall(VecGetKokkosView(bb, &b)); 1812 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1813 1814 // Solve U^T Y = B 1815 if (identity) { // Reorder b with the row permutation 1816 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1817 Y = factors->workVector; 1818 } else { 1819 B = factors->workVector; 1820 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1821 Y = x; 1822 } 1823 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1824 1825 // Solve diag(D) Y' = Y. 1826 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1827 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1828 1829 // Solve UX = Y 1830 if (identity) { 1831 X = x; 1832 } else { 1833 X = factors->workVector; // B is not needed anymore 1834 } 1835 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1836 1837 // Reorder X with the inverse column (row) permutation 1838 if (!identity) { 1839 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1840 } 1841 1842 PetscCall(VecRestoreKokkosView(bb, &b)); 1843 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1844 PetscCall(PetscLogGpuTimeEnd()); 1845 PetscFunctionReturn(PETSC_SUCCESS); 1846 } 1847 1848 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1849 // R and C are represented by rowperm and colperm in factors. 1850 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1851 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1852 { 1853 auto exec = PetscGetKokkosExecutionSpace(); 1854 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1855 PetscInt m = A->rmap->n; 1856 PetscScalarKokkosView X, Y, B; // alias 1857 ConstPetscScalarKokkosView b; 1858 PetscScalarKokkosView x; 1859 PetscIntKokkosView &rowperm = factors->rowperm; 1860 PetscIntKokkosView &colperm = factors->colperm; 1861 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1862 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1863 1864 PetscFunctionBegin; 1865 PetscCall(PetscLogGpuTimeBegin()); 1866 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1867 PetscCall(VecGetKokkosView(bb, &b)); 1868 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1869 1870 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1871 if (row_identity) { 1872 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1873 Y = factors->workVector; 1874 } else { 1875 B = factors->workVector; 1876 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1877 Y = x; 1878 } 1879 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1880 1881 // Solve U C^- x = Y 1882 if (col_identity) { 1883 X = x; 1884 } else { 1885 X = factors->workVector; 1886 } 1887 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1888 1889 // x = C X; Reorder X with the inverse col permutation 1890 if (!col_identity) { 1891 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1892 } 1893 1894 PetscCall(VecRestoreKokkosView(bb, &b)); 1895 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1896 PetscCall(PetscLogGpuTimeEnd()); 1897 PetscFunctionReturn(PETSC_SUCCESS); 1898 } 1899 1900 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1901 // R and C are represented by rowperm and colperm in factors. 1902 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1903 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1904 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1905 { 1906 auto exec = PetscGetKokkosExecutionSpace(); 1907 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1908 PetscInt m = A->rmap->n; 1909 PetscScalarKokkosView X, Y, B; // alias 1910 ConstPetscScalarKokkosView b; 1911 PetscScalarKokkosView x; 1912 PetscIntKokkosView &rowperm = factors->rowperm; 1913 PetscIntKokkosView &colperm = factors->colperm; 1914 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1915 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1916 1917 PetscFunctionBegin; 1918 PetscCall(PetscLogGpuTimeBegin()); 1919 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1920 PetscCall(VecGetKokkosView(bb, &b)); 1921 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1922 1923 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1924 if (col_identity) { // Reorder b with the col permutation 1925 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1926 Y = factors->workVector; 1927 } else { 1928 B = factors->workVector; 1929 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1930 Y = x; 1931 } 1932 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1933 1934 // Solve L^T X = Y 1935 if (row_identity) { 1936 X = x; 1937 } else { 1938 X = factors->workVector; 1939 } 1940 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1941 1942 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1943 if (!row_identity) { 1944 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1945 } 1946 1947 PetscCall(VecRestoreKokkosView(bb, &b)); 1948 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1949 PetscCall(PetscLogGpuTimeEnd()); 1950 PetscFunctionReturn(PETSC_SUCCESS); 1951 } 1952 1953 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1954 { 1955 PetscFunctionBegin; 1956 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1957 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1958 1959 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1960 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1961 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1962 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1963 const MatScalar *Ba = b->a; 1964 PetscInt m = B->rmap->n, n = B->cmap->n; 1965 1966 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1967 // Allocate memory and copy the structure 1968 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1969 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1970 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1971 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1972 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1973 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1974 1975 PetscInt *Li = factors->iL_h.data(); 1976 PetscInt *Lj = factors->jL_h.data(); 1977 PetscInt *Ui = factors->iU_h.data(); 1978 PetscInt *Uj = factors->jU_h.data(); 1979 1980 Li[0] = Ui[0] = 0; 1981 for (PetscInt i = 0; i < m; i++) { 1982 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1983 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1984 1985 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1986 Lj[Li[i] + llen] = i; // diagonal entry of L 1987 1988 Uj[Ui[i]] = i; // diagonal entry of U 1989 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1990 1991 Li[i + 1] = Li[i] + llen + 1; 1992 Ui[i + 1] = Ui[i] + ulen; 1993 } 1994 1995 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1996 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1997 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1998 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1999 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 2000 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2001 2002 // Copy row/col permutation to device 2003 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2004 PetscBool row_identity; 2005 PetscCall(ISIdentity(rowperm, &row_identity)); 2006 if (!row_identity) { 2007 const PetscInt *ip; 2008 2009 PetscCall(ISGetIndices(rowperm, &ip)); 2010 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 2011 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2012 PetscCall(ISRestoreIndices(rowperm, &ip)); 2013 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2014 } 2015 2016 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2017 PetscBool col_identity; 2018 PetscCall(ISIdentity(colperm, &col_identity)); 2019 if (!col_identity) { 2020 const PetscInt *ip; 2021 2022 PetscCall(ISGetIndices(colperm, &ip)); 2023 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2024 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2025 PetscCall(ISRestoreIndices(colperm, &ip)); 2026 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2027 } 2028 2029 /* Create sptrsv handles for L, U and their transpose */ 2030 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2031 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2032 #else 2033 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2034 #endif 2035 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2036 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2037 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2038 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2039 } 2040 2041 // Copy the value 2042 for (PetscInt i = 0; i < m; i++) { 2043 PetscInt llen = Bi[i + 1] - Bi[i]; 2044 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2045 const PetscInt *Li = factors->iL_h.data(); 2046 const PetscInt *Ui = factors->iU_h.data(); 2047 2048 PetscScalar *La = factors->aL_h.data(); 2049 PetscScalar *Ua = factors->aU_h.data(); 2050 2051 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2052 La[Li[i] + llen] = 1.0; // diagonal entry 2053 2054 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2055 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2056 } 2057 2058 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2059 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2060 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2061 factors->transpose_updated = PETSC_FALSE; 2062 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2063 2064 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2065 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2066 } 2067 2068 B->ops->matsolve = NULL; 2069 B->ops->matsolvetranspose = NULL; 2070 PetscFunctionReturn(PETSC_SUCCESS); 2071 } 2072 2073 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2074 { 2075 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2076 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2077 PetscInt fill_lev = info->levels; 2078 2079 PetscFunctionBegin; 2080 PetscCall(PetscLogGpuTimeBegin()); 2081 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2082 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2083 2084 auto a_d = aijkok->a_dual.view_device(); 2085 auto i_d = aijkok->i_dual.view_device(); 2086 auto j_d = aijkok->j_dual.view_device(); 2087 2088 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2089 2090 B->assembled = PETSC_TRUE; 2091 B->preallocated = PETSC_TRUE; 2092 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2093 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2094 B->ops->matsolve = NULL; 2095 B->ops->matsolvetranspose = NULL; 2096 2097 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2098 factors->transpose_updated = PETSC_FALSE; 2099 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2100 /* TODO: log flops, but how to know that? */ 2101 PetscCall(PetscLogGpuTimeEnd()); 2102 PetscFunctionReturn(PETSC_SUCCESS); 2103 } 2104 2105 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2106 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2107 { 2108 Mat_SeqAIJKokkos *aijkok; 2109 Mat_SeqAIJ *b; 2110 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2111 PetscInt fill_lev = info->levels; 2112 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2113 PetscInt n = A->rmap->n; 2114 2115 PetscFunctionBegin; 2116 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2117 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2118 2119 /* Create a spiluk handle and then do symbolic factorization */ 2120 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2121 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2122 2123 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2124 2125 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2126 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2127 Kokkos::realloc(factors->iU_d, n + 1); 2128 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2129 2130 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2131 auto i_d = aijkok->i_dual.view_device(); 2132 auto j_d = aijkok->j_dual.view_device(); 2133 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2134 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2135 2136 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2137 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2138 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2139 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2140 2141 /* TODO: add options to select sptrsv algorithms */ 2142 /* Create sptrsv handles for L, U and their transpose */ 2143 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2144 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2145 #else 2146 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2147 #endif 2148 2149 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2150 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2151 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2152 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2153 2154 /* Fill fields of the factor matrix B */ 2155 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2156 b = (Mat_SeqAIJ *)B->data; 2157 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2158 B->info.fill_ratio_given = info->fill; 2159 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2160 2161 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2162 PetscFunctionReturn(PETSC_SUCCESS); 2163 } 2164 2165 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2166 { 2167 PetscFunctionBegin; 2168 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2169 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2170 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2171 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2172 PetscFunctionReturn(PETSC_SUCCESS); 2173 } 2174 2175 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2176 { 2177 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2178 2179 PetscFunctionBegin; 2180 if (!info->factoronhost) { 2181 PetscCall(ISIdentity(isrow, &row_identity)); 2182 PetscCall(ISIdentity(iscol, &col_identity)); 2183 } 2184 2185 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2186 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2187 2188 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2189 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2190 } else { 2191 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2192 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2193 } 2194 PetscFunctionReturn(PETSC_SUCCESS); 2195 } 2196 2197 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2198 { 2199 PetscFunctionBegin; 2200 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2201 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2202 2203 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2204 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2205 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2206 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2207 const MatScalar *Ba = b->a; 2208 PetscInt m = B->rmap->n; 2209 2210 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2211 // Allocate memory and copy the structure 2212 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2213 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2214 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2215 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2216 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2217 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2218 2219 // Build jU_h from the skewed Aj 2220 PetscInt *Uj = factors->jU_h.data(); 2221 for (PetscInt i = 0; i < m; i++) { 2222 PetscInt ulen = Bi[i + 1] - Bi[i]; 2223 Uj[Bi[i]] = i; // diagonal entry 2224 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2225 } 2226 2227 // Copy iU, jU to device 2228 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2229 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2230 2231 // Copy row/col permutation to device 2232 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2233 PetscBool row_identity; 2234 PetscCall(ISIdentity(rowperm, &row_identity)); 2235 if (!row_identity) { 2236 const PetscInt *ip; 2237 2238 PetscCall(ISGetIndices(rowperm, &ip)); 2239 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2240 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2241 PetscCall(ISRestoreIndices(rowperm, &ip)); 2242 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2243 } 2244 2245 // Create sptrsv handles for U and U^T 2246 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2247 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2248 #else 2249 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2250 #endif 2251 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2252 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2253 } 2254 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2255 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2256 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2257 2258 // Copy the value 2259 PetscScalar *Ua = factors->aU_h.data(); 2260 PetscScalar *D = factors->D_h.data(); 2261 for (PetscInt i = 0; i < m; i++) { 2262 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2263 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2264 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2265 } 2266 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2267 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2268 2269 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2270 factors->transpose_updated = PETSC_FALSE; 2271 } 2272 2273 B->ops->matsolve = NULL; 2274 B->ops->matsolvetranspose = NULL; 2275 PetscFunctionReturn(PETSC_SUCCESS); 2276 } 2277 2278 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2279 { 2280 PetscFunctionBegin; 2281 if (info->solveonhost) { 2282 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2283 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2284 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2285 } 2286 2287 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2288 2289 if (!info->solveonhost) { 2290 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2291 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2292 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2293 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2294 } 2295 PetscFunctionReturn(PETSC_SUCCESS); 2296 } 2297 2298 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2299 { 2300 PetscFunctionBegin; 2301 if (info->solveonhost) { 2302 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2303 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2304 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2305 } 2306 2307 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2308 2309 if (!info->solveonhost) { 2310 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2311 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2312 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2313 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2314 } 2315 PetscFunctionReturn(PETSC_SUCCESS); 2316 } 2317 2318 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2319 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2320 { 2321 PetscFunctionBegin; 2322 *type = MATSOLVERKOKKOS; 2323 PetscFunctionReturn(PETSC_SUCCESS); 2324 } 2325 2326 /*MC 2327 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2328 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2329 2330 Level: beginner 2331 2332 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2333 M*/ 2334 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2335 { 2336 PetscInt n = A->rmap->n; 2337 MPI_Comm comm; 2338 2339 PetscFunctionBegin; 2340 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2341 PetscCall(MatCreate(comm, B)); 2342 PetscCall(MatSetSizes(*B, n, n, n, n)); 2343 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2344 (*B)->factortype = ftype; 2345 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2346 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2347 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2348 2349 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2350 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2351 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2352 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2353 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2354 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2355 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2356 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2357 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2358 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2359 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2360 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2361 2362 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2363 (*B)->canuseordering = PETSC_TRUE; 2364 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2365 PetscFunctionReturn(PETSC_SUCCESS); 2366 } 2367 2368 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2369 { 2370 PetscFunctionBegin; 2371 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2372 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2373 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2374 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2375 PetscFunctionReturn(PETSC_SUCCESS); 2376 } 2377 2378 /* Utility to print out a KokkosCsrMatrix for debugging */ 2379 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2380 { 2381 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2382 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2383 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2384 const PetscInt *i = iv.data(); 2385 const PetscInt *j = jv.data(); 2386 const PetscScalar *a = av.data(); 2387 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2388 2389 PetscFunctionBegin; 2390 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2391 for (PetscInt k = 0; k < m; k++) { 2392 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2393 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2394 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2395 } 2396 PetscFunctionReturn(PETSC_SUCCESS); 2397 } 2398