1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165 *array = aijkok->a_dual.view_host().data(); 166 } else { /* Happens when calling MatSetValues on a newly created matrix */ 167 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168 } 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171 172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175 176 PetscFunctionBegin; 177 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188 *array = aijkok->a_dual.view_host().data(); 189 } else { 190 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 191 } 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196 { 197 PetscFunctionBegin; 198 *array = NULL; 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205 206 PetscFunctionBegin; 207 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208 *array = aijkok->a_dual.view_host().data(); 209 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 210 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 211 } 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218 219 PetscFunctionBegin; 220 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221 aijkok->a_dual.clear_sync_state(); 222 aijkok->a_dual.modify_host(); 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228 { 229 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 230 231 PetscFunctionBegin; 232 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 233 234 if (i) *i = aijkok->i_device_data(); 235 if (j) *j = aijkok->j_device_data(); 236 if (a) { 237 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 238 *a = aijkok->a_device_data(); 239 } 240 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 /* 245 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 246 247 Input Parameter: 248 . A - the MATSEQAIJKOKKOS matrix 249 250 Output Parameters: 251 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 252 - T_d - the transpose on device, whose value array is allocated but not initialized 253 */ 254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 258 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 259 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 260 MatRowMapType *Ti = Ti_h.data(); 261 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 262 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 263 PetscInt *Tj = Tj_h.data(); 264 PetscInt *perm = perm_h.data(); 265 PetscInt *offset; 266 267 PetscFunctionBegin; 268 // Populate Ti 269 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 270 Ti++; 271 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 272 Ti--; 273 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 274 275 // Populate Tj and the permutation array 276 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 277 for (PetscInt i = 0; i < m; i++) { 278 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 279 PetscInt r = Aj[j]; // row r of T 280 PetscInt disp = Ti[r] + offset[r]; 281 282 Tj[disp] = i; // col i of T 283 perm[disp] = j; 284 offset[r]++; 285 } 286 } 287 PetscCall(PetscFree(offset)); 288 289 // Sort each row of T, along with the permutation array 290 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 291 292 // Output perm and T on device 293 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 294 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 295 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 296 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 297 PetscFunctionReturn(PETSC_SUCCESS); 298 } 299 300 // Generate the transpose on device and cache it internally 301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 303 { 304 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 305 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 306 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 307 KokkosCsrMatrix &T = akok->csrmatT; 308 309 PetscFunctionBegin; 310 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 311 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 312 313 const auto &Aa = akok->a_dual.view_device(); 314 315 if (A->symmetric == PETSC_BOOL3_TRUE) { 316 *csrmatT = akok->csrmat; 317 } else { 318 // See if we already have a cached transpose and its value is up to date 319 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 320 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 321 const auto &perm = akok->transpose_perm; // get the permutation array 322 auto &Ta = T.values; 323 324 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 325 } 326 } else { // Generate T of size n x m for the first time 327 MatRowMapKokkosView perm; 328 329 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 330 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 331 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 332 } 333 akok->transpose_updated = PETSC_TRUE; 334 *csrmatT = akok->csrmatT; 335 } 336 PetscFunctionReturn(PETSC_SUCCESS); 337 } 338 339 // Generate the Hermitian on device and cache it internally 340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 341 { 342 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 343 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 344 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 345 KokkosCsrMatrix &T = akok->csrmatH; 346 347 PetscFunctionBegin; 348 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 349 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 350 351 const auto &Aa = akok->a_dual.view_device(); 352 353 if (A->hermitian == PETSC_BOOL3_TRUE) { 354 *csrmatH = akok->csrmat; 355 } else { 356 // See if we already have a cached hermitian and its value is up to date 357 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 358 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 359 const auto &perm = akok->transpose_perm; // get the permutation array 360 auto &Ta = T.values; 361 362 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 363 } 364 } else { // Generate T of size n x m for the first time 365 MatRowMapKokkosView perm; 366 367 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 368 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 369 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 370 } 371 akok->hermitian_updated = PETSC_TRUE; 372 *csrmatH = akok->csrmatH; 373 } 374 PetscFunctionReturn(PETSC_SUCCESS); 375 } 376 377 /* y = A x */ 378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 379 { 380 Mat_SeqAIJKokkos *aijkok; 381 ConstPetscScalarKokkosView xv; 382 PetscScalarKokkosView yv; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 389 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 390 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 391 PetscCall(VecRestoreKokkosView(xx, &xv)); 392 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 393 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 394 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 395 PetscCall(PetscLogGpuTimeEnd()); 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 /* y = A^T x */ 400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 401 { 402 Mat_SeqAIJKokkos *aijkok; 403 const char *mode; 404 ConstPetscScalarKokkosView xv; 405 PetscScalarKokkosView yv; 406 KokkosCsrMatrix csrmat; 407 408 PetscFunctionBegin; 409 PetscCall(PetscLogGpuTimeBegin()); 410 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 411 PetscCall(VecGetKokkosView(xx, &xv)); 412 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 413 if (A->form_explicit_transpose) { 414 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 415 mode = "N"; 416 } else { 417 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 418 csrmat = aijkok->csrmat; 419 mode = "T"; 420 } 421 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 422 PetscCall(VecRestoreKokkosView(xx, &xv)); 423 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 424 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 425 PetscCall(PetscLogGpuTimeEnd()); 426 PetscFunctionReturn(PETSC_SUCCESS); 427 } 428 429 /* y = A^H x */ 430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 431 { 432 Mat_SeqAIJKokkos *aijkok; 433 const char *mode; 434 ConstPetscScalarKokkosView xv; 435 PetscScalarKokkosView yv; 436 KokkosCsrMatrix csrmat; 437 438 PetscFunctionBegin; 439 PetscCall(PetscLogGpuTimeBegin()); 440 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 441 PetscCall(VecGetKokkosView(xx, &xv)); 442 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 443 if (A->form_explicit_transpose) { 444 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 445 mode = "N"; 446 } else { 447 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 448 csrmat = aijkok->csrmat; 449 mode = "C"; 450 } 451 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 452 PetscCall(VecRestoreKokkosView(xx, &xv)); 453 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 454 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 455 PetscCall(PetscLogGpuTimeEnd()); 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* z = A x + y */ 460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 461 { 462 Mat_SeqAIJKokkos *aijkok; 463 ConstPetscScalarKokkosView xv; 464 PetscScalarKokkosView zv; 465 466 PetscFunctionBegin; 467 PetscCall(PetscLogGpuTimeBegin()); 468 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 469 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 470 PetscCall(VecGetKokkosView(xx, &xv)); 471 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 472 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 473 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 474 PetscCall(VecRestoreKokkosView(xx, &xv)); 475 PetscCall(VecRestoreKokkosView(zz, &zv)); 476 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 477 PetscCall(PetscLogGpuTimeEnd()); 478 PetscFunctionReturn(PETSC_SUCCESS); 479 } 480 481 /* z = A^T x + y */ 482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 483 { 484 Mat_SeqAIJKokkos *aijkok; 485 const char *mode; 486 ConstPetscScalarKokkosView xv; 487 PetscScalarKokkosView zv; 488 KokkosCsrMatrix csrmat; 489 490 PetscFunctionBegin; 491 PetscCall(PetscLogGpuTimeBegin()); 492 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 493 if (zz != yy) PetscCall(VecCopy(yy, zz)); 494 PetscCall(VecGetKokkosView(xx, &xv)); 495 PetscCall(VecGetKokkosView(zz, &zv)); 496 if (A->form_explicit_transpose) { 497 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 498 mode = "N"; 499 } else { 500 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 501 csrmat = aijkok->csrmat; 502 mode = "T"; 503 } 504 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 505 PetscCall(VecRestoreKokkosView(xx, &xv)); 506 PetscCall(VecRestoreKokkosView(zz, &zv)); 507 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 508 PetscCall(PetscLogGpuTimeEnd()); 509 PetscFunctionReturn(PETSC_SUCCESS); 510 } 511 512 /* z = A^H x + y */ 513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 514 { 515 Mat_SeqAIJKokkos *aijkok; 516 const char *mode; 517 ConstPetscScalarKokkosView xv; 518 PetscScalarKokkosView zv; 519 KokkosCsrMatrix csrmat; 520 521 PetscFunctionBegin; 522 PetscCall(PetscLogGpuTimeBegin()); 523 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 524 if (zz != yy) PetscCall(VecCopy(yy, zz)); 525 PetscCall(VecGetKokkosView(xx, &xv)); 526 PetscCall(VecGetKokkosView(zz, &zv)); 527 if (A->form_explicit_transpose) { 528 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 529 mode = "N"; 530 } else { 531 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 532 csrmat = aijkok->csrmat; 533 mode = "C"; 534 } 535 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 536 PetscCall(VecRestoreKokkosView(xx, &xv)); 537 PetscCall(VecRestoreKokkosView(zz, &zv)); 538 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 539 PetscCall(PetscLogGpuTimeEnd()); 540 PetscFunctionReturn(PETSC_SUCCESS); 541 } 542 543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 544 { 545 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 546 547 PetscFunctionBegin; 548 switch (op) { 549 case MAT_FORM_EXPLICIT_TRANSPOSE: 550 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 551 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 552 A->form_explicit_transpose = flg; 553 break; 554 default: 555 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 556 break; 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 /* Depending on reuse, either build a new mat, or use the existing mat */ 562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 563 { 564 Mat_SeqAIJ *aseq; 565 566 PetscFunctionBegin; 567 PetscCall(PetscKokkosInitializeCheck()); 568 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 569 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 570 PetscCall(MatSetType(*newmat, mtype)); 571 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 572 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 573 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 574 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 575 PetscCall(PetscFree(A->defaultvectype)); 576 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 577 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 578 PetscCall(MatSetOps_SeqAIJKokkos(A)); 579 aseq = static_cast<Mat_SeqAIJ *>(A->data); 580 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 581 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 582 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 583 } 584 } 585 PetscFunctionReturn(PETSC_SUCCESS); 586 } 587 588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 589 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 590 */ 591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 592 { 593 Mat_SeqAIJ *bseq; 594 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 595 Mat mat; 596 597 PetscFunctionBegin; 598 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 599 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 600 mat = *B; 601 if (A->assembled) { 602 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 603 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 604 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 605 /* Now copy values to B if needed */ 606 if (dupOption == MAT_COPY_VALUES) { 607 if (akok->a_dual.need_sync_device()) { 608 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 609 bkok->a_dual.modify_host(); 610 } else { /* If device has the latest data, we only copy data on device */ 611 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 612 bkok->a_dual.modify_device(); 613 } 614 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 615 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 616 bkok->a_dual.modify_host(); 617 } 618 mat->spptr = bkok; 619 } 620 621 PetscCall(PetscFree(mat->defaultvectype)); 622 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 623 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 624 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 625 PetscFunctionReturn(PETSC_SUCCESS); 626 } 627 628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 629 { 630 Mat At; 631 KokkosCsrMatrix internT; 632 Mat_SeqAIJKokkos *atkok, *bkok; 633 634 PetscFunctionBegin; 635 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 636 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 637 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 638 /* Deep copy internT, as we want to isolate the internal transpose */ 639 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 640 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 641 if (reuse == MAT_INITIAL_MATRIX) *B = At; 642 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 643 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 644 if ((*B)->assembled) { 645 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 646 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 647 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 648 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 649 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 650 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 651 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 652 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 653 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 654 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 655 } 656 PetscFunctionReturn(PETSC_SUCCESS); 657 } 658 659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 660 { 661 Mat_SeqAIJKokkos *aijkok; 662 663 PetscFunctionBegin; 664 if (A->factortype == MAT_FACTOR_NONE) { 665 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 666 delete aijkok; 667 } else { 668 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 669 } 670 A->spptr = NULL; 671 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 672 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 673 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 674 #if defined(PETSC_HAVE_HYPRE) 675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 676 #endif 677 PetscCall(MatDestroy_SeqAIJ(A)); 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 /*MC 682 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 683 684 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 685 686 Options Database Key: 687 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 688 689 Level: beginner 690 691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 692 M*/ 693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 694 { 695 PetscFunctionBegin; 696 PetscCall(PetscKokkosInitializeCheck()); 697 PetscCall(MatCreate_SeqAIJ(A)); 698 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 699 PetscFunctionReturn(PETSC_SUCCESS); 700 } 701 702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 704 { 705 Mat_SeqAIJ *a, *b; 706 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 707 MatScalarKokkosView aa, ba, ca; 708 MatRowMapKokkosView ai, bi, ci; 709 MatColIdxKokkosView aj, bj, cj; 710 PetscInt m, n, nnz, aN; 711 712 PetscFunctionBegin; 713 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 714 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 715 PetscAssertPointer(C, 4); 716 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 717 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 718 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 719 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 720 721 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 722 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 723 a = static_cast<Mat_SeqAIJ *>(A->data); 724 b = static_cast<Mat_SeqAIJ *>(B->data); 725 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 726 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 727 aa = akok->a_dual.view_device(); 728 ai = akok->i_dual.view_device(); 729 ba = bkok->a_dual.view_device(); 730 bi = bkok->i_dual.view_device(); 731 m = A->rmap->n; /* M, N and nnz of C */ 732 n = A->cmap->n + B->cmap->n; 733 nnz = a->nz + b->nz; 734 aN = A->cmap->n; /* N of A */ 735 if (reuse == MAT_INITIAL_MATRIX) { 736 aj = akok->j_dual.view_device(); 737 bj = bkok->j_dual.view_device(); 738 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 739 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 740 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 741 ca = ca_dual.view_device(); 742 ci = ci_dual.view_device(); 743 cj = cj_dual.view_device(); 744 745 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 746 Kokkos::parallel_for( 747 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 748 PetscInt i = t.league_rank(); /* row i */ 749 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 750 751 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 752 ci(i) = coffset; 753 if (i == m - 1) ci(m) = ai(m) + bi(m); 754 }); 755 756 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 757 if (k < alen) { 758 ca(coffset + k) = aa(ai(i) + k); 759 cj(coffset + k) = aj(ai(i) + k); 760 } else { 761 ca(coffset + k) = ba(bi(i) + k - alen); 762 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 763 } 764 }); 765 }); 766 ca_dual.modify_device(); 767 ci_dual.modify_device(); 768 cj_dual.modify_device(); 769 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 770 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 771 } else if (reuse == MAT_REUSE_MATRIX) { 772 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 773 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 774 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 775 ca = ckok->a_dual.view_device(); 776 ci = ckok->i_dual.view_device(); 777 778 Kokkos::parallel_for( 779 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 780 PetscInt i = t.league_rank(); /* row i */ 781 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 782 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 783 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 784 else ca(ci(i) + k) = ba(bi(i) + k - alen); 785 }); 786 }); 787 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 788 } 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 793 { 794 PetscFunctionBegin; 795 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 800 { 801 Mat_Product *product = C->product; 802 Mat A, B; 803 bool transA, transB; /* use bool, since KK needs this type */ 804 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 805 Mat_SeqAIJ *c; 806 MatProductData_SeqAIJKokkos *pdata; 807 KokkosCsrMatrix csrmatA, csrmatB; 808 809 PetscFunctionBegin; 810 MatCheckProduct(C, 1); 811 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 812 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 813 814 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 815 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 816 // we still do numeric. 817 if (pdata->reusesym) { // numeric reuses results from symbolic 818 pdata->reusesym = PETSC_FALSE; 819 PetscFunctionReturn(PETSC_SUCCESS); 820 } 821 822 switch (product->type) { 823 case MATPRODUCT_AB: 824 transA = false; 825 transB = false; 826 break; 827 case MATPRODUCT_AtB: 828 transA = true; 829 transB = false; 830 break; 831 case MATPRODUCT_ABt: 832 transA = false; 833 transB = true; 834 break; 835 default: 836 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 837 } 838 839 A = product->A; 840 B = product->B; 841 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 842 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 843 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 844 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 845 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 846 847 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 848 849 csrmatA = akok->csrmat; 850 csrmatB = bkok->csrmat; 851 852 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 853 if (transA) { 854 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 855 transA = false; 856 } 857 858 if (transB) { 859 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 860 transB = false; 861 } 862 PetscCall(PetscLogGpuTimeBegin()); 863 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 865 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 866 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 867 #endif 868 869 PetscCall(PetscLogGpuTimeEnd()); 870 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 871 /* shorter version of MatAssemblyEnd_SeqAIJ */ 872 c = (Mat_SeqAIJ *)C->data; 873 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 874 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 875 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 876 c->reallocs = 0; 877 C->info.mallocs = 0; 878 C->info.nz_unneeded = 0; 879 C->assembled = C->was_assembled = PETSC_TRUE; 880 C->num_ass++; 881 PetscFunctionReturn(PETSC_SUCCESS); 882 } 883 884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 885 { 886 Mat_Product *product = C->product; 887 MatProductType ptype; 888 Mat A, B; 889 bool transA, transB; 890 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 891 MatProductData_SeqAIJKokkos *pdata; 892 MPI_Comm comm; 893 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 894 895 PetscFunctionBegin; 896 MatCheckProduct(C, 1); 897 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 898 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 899 A = product->A; 900 B = product->B; 901 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 902 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 903 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 904 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 905 csrmatA = akok->csrmat; 906 csrmatB = bkok->csrmat; 907 908 ptype = product->type; 909 // Take advantage of the symmetry if true 910 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 911 ptype = MATPRODUCT_AB; 912 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 913 } 914 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 915 ptype = MATPRODUCT_AB; 916 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 917 } 918 919 switch (ptype) { 920 case MATPRODUCT_AB: 921 transA = false; 922 transB = false; 923 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 924 break; 925 case MATPRODUCT_AtB: 926 transA = true; 927 transB = false; 928 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 929 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 930 break; 931 case MATPRODUCT_ABt: 932 transA = false; 933 transB = true; 934 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 935 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 936 break; 937 default: 938 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 939 } 940 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 941 pdata->reusesym = product->api_user; 942 943 /* TODO: add command line options to select spgemm algorithms */ 944 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 945 946 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 948 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 949 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 950 #endif 951 #endif 952 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 953 954 PetscCall(PetscLogGpuTimeBegin()); 955 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 956 if (transA) { 957 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 958 transA = false; 959 } 960 961 if (transB) { 962 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 963 transB = false; 964 } 965 966 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 967 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 968 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 969 calling new Mat_SeqAIJKokkos(). 970 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 971 */ 972 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 974 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 975 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 976 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 977 #endif 978 PetscCall(PetscLogGpuTimeEnd()); 979 980 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 981 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 982 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 983 PetscFunctionReturn(PETSC_SUCCESS); 984 } 985 986 /* handles sparse matrix matrix ops */ 987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 988 { 989 Mat_Product *product = mat->product; 990 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 991 992 PetscFunctionBegin; 993 MatCheckProduct(mat, 1); 994 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 995 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 996 if (Biskok && Ciskok) { 997 switch (product->type) { 998 case MATPRODUCT_AB: 999 case MATPRODUCT_AtB: 1000 case MATPRODUCT_ABt: 1001 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1002 break; 1003 case MATPRODUCT_PtAP: 1004 case MATPRODUCT_RARt: 1005 case MATPRODUCT_ABC: 1006 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1007 break; 1008 default: 1009 break; 1010 } 1011 } else { /* fallback for AIJ */ 1012 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1018 { 1019 Mat_SeqAIJKokkos *aijkok; 1020 1021 PetscFunctionBegin; 1022 PetscCall(PetscLogGpuTimeBegin()); 1023 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1024 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1025 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1026 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1027 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1028 PetscCall(PetscLogGpuTimeEnd()); 1029 PetscFunctionReturn(PETSC_SUCCESS); 1030 } 1031 1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1034 { 1035 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1036 1037 PetscFunctionBegin; 1038 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1039 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1040 1041 PetscCall(PetscLogGpuTimeBegin()); 1042 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1043 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1044 const auto &Aa = aijkok->a_dual.view_device(); 1045 const auto &Adiag = aijkok->diag_dual.view_device(); 1046 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1047 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1048 PetscCall(PetscLogGpuFlops(n)); 1049 PetscCall(PetscLogGpuTimeEnd()); 1050 } else { // need reassembly, very slow! 1051 PetscCall(MatShift_Basic(A, a)); 1052 } 1053 PetscFunctionReturn(PETSC_SUCCESS); 1054 } 1055 1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1057 { 1058 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1059 1060 PetscFunctionBegin; 1061 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1062 ConstPetscScalarKokkosView dv; 1063 PetscInt n, nv; 1064 1065 PetscCall(PetscLogGpuTimeBegin()); 1066 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1067 PetscCall(VecGetKokkosView(D, &dv)); 1068 PetscCall(VecGetLocalSize(D, &nv)); 1069 n = PetscMin(Y->rmap->n, Y->cmap->n); 1070 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1071 1072 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1073 const auto &Aa = aijkok->a_dual.view_device(); 1074 const auto &Adiag = aijkok->diag_dual.view_device(); 1075 PetscCallCXX(Kokkos::parallel_for( 1076 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1077 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1078 else Aa(Adiag(i)) += dv(i); 1079 })); 1080 PetscCall(VecRestoreKokkosView(D, &dv)); 1081 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1082 PetscCall(PetscLogGpuFlops(n)); 1083 PetscCall(PetscLogGpuTimeEnd()); 1084 } else { // need reassembly, very slow! 1085 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1086 } 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1091 { 1092 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1093 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1094 ConstPetscScalarKokkosView lv, rv; 1095 1096 PetscFunctionBegin; 1097 PetscCall(PetscLogGpuTimeBegin()); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 const auto &Aa = aijkok->a_dual.view_device(); 1101 const auto &Ai = aijkok->i_dual.view_device(); 1102 const auto &Aj = aijkok->j_dual.view_device(); 1103 if (ll) { 1104 PetscCall(VecGetLocalSize(ll, &m)); 1105 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1106 PetscCall(VecGetKokkosView(ll, &lv)); 1107 PetscCallCXX(Kokkos::parallel_for( // for each row 1108 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1109 PetscInt i = t.league_rank(); // row i 1110 PetscInt len = Ai(i + 1) - Ai(i); 1111 // scale entries on the row 1112 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1113 })); 1114 PetscCall(VecRestoreKokkosView(ll, &lv)); 1115 PetscCall(PetscLogGpuFlops(nz)); 1116 } 1117 if (rr) { 1118 PetscCall(VecGetLocalSize(rr, &n)); 1119 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1120 PetscCall(VecGetKokkosView(rr, &rv)); 1121 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1122 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1123 PetscCall(VecRestoreKokkosView(rr, &lv)); 1124 PetscCall(PetscLogGpuFlops(nz)); 1125 } 1126 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1127 PetscCall(PetscLogGpuTimeEnd()); 1128 PetscFunctionReturn(PETSC_SUCCESS); 1129 } 1130 1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1132 { 1133 Mat_SeqAIJKokkos *aijkok; 1134 1135 PetscFunctionBegin; 1136 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1137 if (aijkok) { /* Only zero the device if data is already there */ 1138 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1139 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1140 } else { /* Might be preallocated but not assembled */ 1141 PetscCall(MatZeroEntries_SeqAIJ(A)); 1142 } 1143 PetscFunctionReturn(PETSC_SUCCESS); 1144 } 1145 1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1147 { 1148 Mat_SeqAIJKokkos *aijkok; 1149 PetscInt n; 1150 PetscScalarKokkosView xv; 1151 1152 PetscFunctionBegin; 1153 PetscCall(VecGetLocalSize(x, &n)); 1154 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1155 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1156 1157 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1158 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1159 1160 const auto &Aa = aijkok->a_dual.view_device(); 1161 const auto &Ai = aijkok->i_dual.view_device(); 1162 const auto &Adiag = aijkok->diag_dual.view_device(); 1163 1164 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1165 Kokkos::parallel_for( 1166 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1167 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1168 else xv(i) = 0; 1169 }); 1170 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1171 PetscFunctionReturn(PETSC_SUCCESS); 1172 } 1173 1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1176 { 1177 Mat_SeqAIJKokkos *aijkok; 1178 1179 PetscFunctionBegin; 1180 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1181 PetscAssertPointer(kv, 2); 1182 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1183 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1184 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1185 *kv = aijkok->a_dual.view_device(); 1186 PetscFunctionReturn(PETSC_SUCCESS); 1187 } 1188 1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1190 { 1191 PetscFunctionBegin; 1192 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1193 PetscAssertPointer(kv, 2); 1194 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1199 { 1200 Mat_SeqAIJKokkos *aijkok; 1201 1202 PetscFunctionBegin; 1203 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1204 PetscAssertPointer(kv, 2); 1205 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1206 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1207 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1208 *kv = aijkok->a_dual.view_device(); 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1213 { 1214 PetscFunctionBegin; 1215 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1216 PetscAssertPointer(kv, 2); 1217 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1218 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1223 { 1224 Mat_SeqAIJKokkos *aijkok; 1225 1226 PetscFunctionBegin; 1227 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1228 PetscAssertPointer(kv, 2); 1229 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1230 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1231 *kv = aijkok->a_dual.view_device(); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1236 { 1237 PetscFunctionBegin; 1238 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1239 PetscAssertPointer(kv, 2); 1240 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1241 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1246 { 1247 Mat_SeqAIJKokkos *akok; 1248 1249 PetscFunctionBegin; 1250 auto exec = PetscGetKokkosExecutionSpace(); 1251 // Create host copies of the input aij 1252 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1253 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1254 // Don't copy the vals to the host now 1255 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1256 1257 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1258 // Note we have modified device data so it will copy lazily 1259 a_dual.modify_device(); 1260 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1261 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1262 1263 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1264 PetscCall(MatCreate(comm, A)); 1265 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1266 PetscFunctionReturn(PETSC_SUCCESS); 1267 } 1268 1269 /* Computes Y += alpha X */ 1270 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1271 { 1272 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1273 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1274 ConstMatScalarKokkosView Xa; 1275 MatScalarKokkosView Ya; 1276 auto exec = PetscGetKokkosExecutionSpace(); 1277 1278 PetscFunctionBegin; 1279 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1280 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1281 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1282 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1283 PetscCall(PetscLogGpuTimeBegin()); 1284 1285 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1286 PetscBool e; 1287 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1288 if (e) { 1289 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1290 if (e) pattern = SAME_NONZERO_PATTERN; 1291 } 1292 } 1293 1294 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1295 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1296 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1297 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1298 */ 1299 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1300 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1301 Xa = xkok->a_dual.view_device(); 1302 Ya = ykok->a_dual.view_device(); 1303 1304 if (pattern == SAME_NONZERO_PATTERN) { 1305 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1306 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1307 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1308 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1309 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1310 1311 Kokkos::parallel_for( 1312 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1313 PetscInt i = t.league_rank(); // row i 1314 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1315 // Only one thread works in a team 1316 PetscInt p, q = Yi(i); 1317 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1318 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1319 if (Xj(p) == Yj(q)) { // Found it 1320 Ya(q) += alpha * Xa(p); 1321 q++; 1322 } else { 1323 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1324 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1325 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1326 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1327 #else 1328 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1329 #endif 1330 } 1331 } 1332 }); 1333 }); 1334 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1335 } else { // different nonzero patterns 1336 Mat Z; 1337 KokkosCsrMatrix zcsr; 1338 KernelHandle kh; 1339 kh.create_spadd_handle(true); // X, Y are sorted 1340 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1341 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1342 zkok = new Mat_SeqAIJKokkos(zcsr); 1343 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1344 PetscCall(MatHeaderReplace(Y, &Z)); 1345 kh.destroy_spadd_handle(); 1346 } 1347 PetscCall(PetscLogGpuTimeEnd()); 1348 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 struct MatCOOStruct_SeqAIJKokkos { 1353 PetscCount n; 1354 PetscCount Atot; 1355 PetscInt nz; 1356 PetscCountKokkosView jmap; 1357 PetscCountKokkosView perm; 1358 1359 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1360 { 1361 nz = coo_h->nz; 1362 n = coo_h->n; 1363 Atot = coo_h->Atot; 1364 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1365 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1366 } 1367 }; 1368 1369 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1370 { 1371 PetscFunctionBegin; 1372 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1373 PetscFunctionReturn(PETSC_SUCCESS); 1374 } 1375 1376 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1377 { 1378 Mat_SeqAIJKokkos *akok; 1379 Mat_SeqAIJ *aseq; 1380 PetscContainer container_h; 1381 MatCOOStruct_SeqAIJ *coo_h; 1382 MatCOOStruct_SeqAIJKokkos *coo_d; 1383 1384 PetscFunctionBegin; 1385 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1386 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1387 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1388 delete akok; 1389 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1390 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1391 1392 // Copy the COO struct to device 1393 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1394 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1395 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1396 1397 // Put the COO struct in a container and then attach that to the matrix 1398 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1399 PetscFunctionReturn(PETSC_SUCCESS); 1400 } 1401 1402 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1403 { 1404 MatScalarKokkosView Aa; 1405 ConstMatScalarKokkosView kv; 1406 PetscMemType memtype; 1407 PetscContainer container; 1408 MatCOOStruct_SeqAIJKokkos *coo; 1409 1410 PetscFunctionBegin; 1411 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1412 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1413 1414 const auto &n = coo->n; 1415 const auto &Annz = coo->nz; 1416 const auto &jmap = coo->jmap; 1417 const auto &perm = coo->perm; 1418 1419 PetscCall(PetscGetMemType(v, &memtype)); 1420 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1421 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1422 } else { 1423 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1424 } 1425 1426 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1427 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1428 1429 PetscCall(PetscLogGpuTimeBegin()); 1430 Kokkos::parallel_for( 1431 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1432 PetscScalar sum = 0.0; 1433 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1434 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1435 }); 1436 PetscCall(PetscLogGpuTimeEnd()); 1437 1438 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1439 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1440 PetscFunctionReturn(PETSC_SUCCESS); 1441 } 1442 1443 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1444 { 1445 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1446 1447 PetscFunctionBegin; 1448 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1449 A->boundtocpu = PETSC_FALSE; 1450 1451 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1452 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1453 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1454 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1455 A->ops->scale = MatScale_SeqAIJKokkos; 1456 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1457 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1458 A->ops->mult = MatMult_SeqAIJKokkos; 1459 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1460 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1461 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1462 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1463 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1464 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1465 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1466 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1467 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1468 A->ops->shift = MatShift_SeqAIJKokkos; 1469 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1470 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1471 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1472 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1473 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1474 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1475 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1476 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1477 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1478 1479 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1480 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1481 #if defined(PETSC_HAVE_HYPRE) 1482 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1483 #endif 1484 PetscFunctionReturn(PETSC_SUCCESS); 1485 } 1486 1487 /* 1488 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1489 1490 Input Parameters: 1491 + A - the MATSEQAIJKOKKOS matrix 1492 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1493 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1494 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1495 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1496 1497 Output Parameter: 1498 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1499 */ 1500 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1501 { 1502 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1503 PetscInt nblocks = bs.extent(0) - 1; 1504 1505 PetscFunctionBegin; 1506 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1507 1508 // Pull out the diagonal blocks of the matrix and then invert the blocks 1509 auto Aa = akok->a_dual.view_device(); 1510 auto Ai = akok->i_dual.view_device(); 1511 auto Aj = akok->j_dual.view_device(); 1512 auto Adiag = akok->diag_dual.view_device(); 1513 // TODO: how to tune the team size? 1514 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1515 auto ts = Kokkos::AUTO(); 1516 #else 1517 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1518 #endif 1519 PetscCallCXX(Kokkos::parallel_for( 1520 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1521 const PetscInt bid = teamMember.league_rank(); // block id 1522 const PetscInt rstart = bs(bid); // this block starts from this row 1523 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1524 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1525 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1526 1527 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1528 PetscInt i = rstart + r; // i-th row in A 1529 1530 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1531 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1532 1533 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1534 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1535 B(r, c) = 0.0; 1536 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1537 B(r, c) = Aa(first + c); 1538 } else { // this entry does not show up in the CSR 1539 B(r, c) = 0.0; 1540 } 1541 } 1542 } else { // rare case that the diagonal does not exist 1543 const PetscInt begin = Ai(i); 1544 const PetscInt end = Ai(i + 1); 1545 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1546 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1547 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1548 else if (Aj(j) >= rstart + m) break; 1549 } 1550 } 1551 }); 1552 1553 // LU-decompose B (w/o pivoting) and then invert B 1554 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1555 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1556 })); 1557 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1558 PetscFunctionReturn(PETSC_SUCCESS); 1559 } 1560 1561 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1562 { 1563 Mat_SeqAIJ *aseq; 1564 PetscInt i, m, n; 1565 auto exec = PetscGetKokkosExecutionSpace(); 1566 1567 PetscFunctionBegin; 1568 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1569 1570 m = akok->nrows(); 1571 n = akok->ncols(); 1572 PetscCall(MatSetSizes(A, m, n, m, n)); 1573 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1574 1575 /* Set up data structures of A as a MATSEQAIJ */ 1576 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1577 aseq = (Mat_SeqAIJ *)A->data; 1578 1579 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1580 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1581 1582 aseq->i = akok->i_host_data(); 1583 aseq->j = akok->j_host_data(); 1584 aseq->a = akok->a_host_data(); 1585 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1586 aseq->free_a = PETSC_FALSE; 1587 aseq->free_ij = PETSC_FALSE; 1588 aseq->nz = akok->nnz(); 1589 aseq->maxnz = aseq->nz; 1590 1591 PetscCall(PetscMalloc1(m, &aseq->imax)); 1592 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1593 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1594 1595 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1596 akok->nonzerostate = A->nonzerostate; 1597 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1598 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1599 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1600 PetscFunctionReturn(PETSC_SUCCESS); 1601 } 1602 1603 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1604 { 1605 PetscFunctionBegin; 1606 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1607 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1608 PetscFunctionReturn(PETSC_SUCCESS); 1609 } 1610 1611 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1612 { 1613 Mat_SeqAIJKokkos *akok; 1614 1615 PetscFunctionBegin; 1616 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1617 PetscCall(MatCreate(comm, A)); 1618 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1623 1624 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1625 */ 1626 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1627 { 1628 PetscFunctionBegin; 1629 PetscCall(MatCreate(comm, A)); 1630 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1631 PetscFunctionReturn(PETSC_SUCCESS); 1632 } 1633 1634 /*@C 1635 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1636 (the default parallel PETSc format). This matrix will ultimately be handled by 1637 Kokkos for calculations. 1638 1639 Collective 1640 1641 Input Parameters: 1642 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1643 . m - number of rows 1644 . n - number of columns 1645 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1646 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1647 1648 Output Parameter: 1649 . A - the matrix 1650 1651 Level: intermediate 1652 1653 Notes: 1654 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1655 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1656 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1657 1658 The AIJ format, also called 1659 compressed row storage, is fully compatible with standard Fortran 1660 storage. That is, the stored row and column indices can begin at 1661 either one (as in Fortran) or zero. 1662 1663 Specify the preallocated storage with either `nz` or `nnz` (not both). 1664 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1665 allocation. 1666 1667 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1668 @*/ 1669 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1670 { 1671 PetscFunctionBegin; 1672 PetscCall(PetscKokkosInitializeCheck()); 1673 PetscCall(MatCreate(comm, A)); 1674 PetscCall(MatSetSizes(*A, m, n, m, n)); 1675 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1676 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1677 PetscFunctionReturn(PETSC_SUCCESS); 1678 } 1679 1680 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1681 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1682 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1683 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1684 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1685 { 1686 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1687 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1688 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1689 1690 PetscFunctionBegin; 1691 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1692 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1693 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1694 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1695 } 1696 PetscFunctionReturn(PETSC_SUCCESS); 1697 } 1698 1699 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1700 { 1701 const PetscInt n = A->rmap->n; 1702 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1703 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1704 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1705 1706 PetscFunctionBegin; 1707 if (!factors->transpose_updated) { 1708 if (has_upper) { 1709 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1710 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1711 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1712 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1713 } 1714 1715 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1716 if (!factors->U) { 1717 Mat_SeqAIJ *seq; 1718 1719 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1720 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1721 1722 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1723 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1724 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1725 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1726 } else { 1727 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1728 } 1729 // Copy Ut from host to device 1730 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1731 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1732 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1733 } else { // If U was computed on device, we also compute the transpose there 1734 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1735 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1736 factors->jU_d, factors->aU_d, 1737 factors->iUt_d, factors->jUt_d, 1738 factors->aUt_d)); 1739 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1740 } 1741 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1742 } 1743 1744 // do the same for L with LU 1745 if (has_lower) { 1746 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1747 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1748 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1749 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1750 } 1751 1752 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1753 if (!factors->L) { 1754 Mat_SeqAIJ *seq; 1755 1756 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1757 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1758 1759 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1760 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1761 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1762 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1763 } else { 1764 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1765 } 1766 // Copy Lt from host to device 1767 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1768 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1769 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1770 } else { // If L was computed on device, we also compute the transpose there 1771 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1772 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1773 factors->jL_d, factors->aL_d, 1774 factors->iLt_d, factors->jLt_d, 1775 factors->aLt_d)); 1776 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1777 } 1778 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1779 } 1780 1781 factors->transpose_updated = PETSC_TRUE; 1782 } 1783 PetscFunctionReturn(PETSC_SUCCESS); 1784 } 1785 1786 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1787 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1788 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1789 { 1790 auto exec = PetscGetKokkosExecutionSpace(); 1791 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1792 PetscInt m = A->rmap->n; 1793 PetscScalarKokkosView D = factors->D_d; 1794 PetscScalarKokkosView X, Y, B; // alias 1795 ConstPetscScalarKokkosView b; 1796 PetscScalarKokkosView x; 1797 PetscIntKokkosView &rowperm = factors->rowperm; 1798 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1799 1800 PetscFunctionBegin; 1801 PetscCall(PetscLogGpuTimeBegin()); 1802 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1803 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1804 PetscCall(VecGetKokkosView(bb, &b)); 1805 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1806 1807 // Solve U^T Y = B 1808 if (identity) { // Reorder b with the row permutation 1809 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1810 Y = factors->workVector; 1811 } else { 1812 B = factors->workVector; 1813 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1814 Y = x; 1815 } 1816 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1817 1818 // Solve diag(D) Y' = Y. 1819 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1820 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1821 1822 // Solve UX = Y 1823 if (identity) { 1824 X = x; 1825 } else { 1826 X = factors->workVector; // B is not needed anymore 1827 } 1828 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1829 1830 // Reorder X with the inverse column (row) permutation 1831 if (!identity) { 1832 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1833 } 1834 1835 PetscCall(VecRestoreKokkosView(bb, &b)); 1836 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1837 PetscCall(PetscLogGpuTimeEnd()); 1838 PetscFunctionReturn(PETSC_SUCCESS); 1839 } 1840 1841 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1842 // R and C are represented by rowperm and colperm in factors. 1843 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1844 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1845 { 1846 auto exec = PetscGetKokkosExecutionSpace(); 1847 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1848 PetscInt m = A->rmap->n; 1849 PetscScalarKokkosView X, Y, B; // alias 1850 ConstPetscScalarKokkosView b; 1851 PetscScalarKokkosView x; 1852 PetscIntKokkosView &rowperm = factors->rowperm; 1853 PetscIntKokkosView &colperm = factors->colperm; 1854 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1855 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1856 1857 PetscFunctionBegin; 1858 PetscCall(PetscLogGpuTimeBegin()); 1859 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1860 PetscCall(VecGetKokkosView(bb, &b)); 1861 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1862 1863 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1864 if (row_identity) { 1865 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1866 Y = factors->workVector; 1867 } else { 1868 B = factors->workVector; 1869 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1870 Y = x; 1871 } 1872 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1873 1874 // Solve U C^- x = Y 1875 if (col_identity) { 1876 X = x; 1877 } else { 1878 X = factors->workVector; 1879 } 1880 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1881 1882 // x = C X; Reorder X with the inverse col permutation 1883 if (!col_identity) { 1884 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1885 } 1886 1887 PetscCall(VecRestoreKokkosView(bb, &b)); 1888 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1889 PetscCall(PetscLogGpuTimeEnd()); 1890 PetscFunctionReturn(PETSC_SUCCESS); 1891 } 1892 1893 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1894 // R and C are represented by rowperm and colperm in factors. 1895 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1896 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1897 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1898 { 1899 auto exec = PetscGetKokkosExecutionSpace(); 1900 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1901 PetscInt m = A->rmap->n; 1902 PetscScalarKokkosView X, Y, B; // alias 1903 ConstPetscScalarKokkosView b; 1904 PetscScalarKokkosView x; 1905 PetscIntKokkosView &rowperm = factors->rowperm; 1906 PetscIntKokkosView &colperm = factors->colperm; 1907 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1908 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1909 1910 PetscFunctionBegin; 1911 PetscCall(PetscLogGpuTimeBegin()); 1912 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1913 PetscCall(VecGetKokkosView(bb, &b)); 1914 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1915 1916 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1917 if (col_identity) { // Reorder b with the col permutation 1918 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1919 Y = factors->workVector; 1920 } else { 1921 B = factors->workVector; 1922 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1923 Y = x; 1924 } 1925 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1926 1927 // Solve L^T X = Y 1928 if (row_identity) { 1929 X = x; 1930 } else { 1931 X = factors->workVector; 1932 } 1933 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1934 1935 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1936 if (!row_identity) { 1937 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1938 } 1939 1940 PetscCall(VecRestoreKokkosView(bb, &b)); 1941 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1942 PetscCall(PetscLogGpuTimeEnd()); 1943 PetscFunctionReturn(PETSC_SUCCESS); 1944 } 1945 1946 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1947 { 1948 PetscFunctionBegin; 1949 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1950 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1951 1952 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1953 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1954 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1955 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1956 const MatScalar *Ba = b->a; 1957 PetscInt m = B->rmap->n, n = B->cmap->n; 1958 1959 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1960 // Allocate memory and copy the structure 1961 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1962 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1963 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1964 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1965 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1966 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1967 1968 PetscInt *Li = factors->iL_h.data(); 1969 PetscInt *Lj = factors->jL_h.data(); 1970 PetscInt *Ui = factors->iU_h.data(); 1971 PetscInt *Uj = factors->jU_h.data(); 1972 1973 Li[0] = Ui[0] = 0; 1974 for (PetscInt i = 0; i < m; i++) { 1975 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1976 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1977 1978 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1979 Lj[Li[i] + llen] = i; // diagonal entry of L 1980 1981 Uj[Ui[i]] = i; // diagonal entry of U 1982 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1983 1984 Li[i + 1] = Li[i] + llen + 1; 1985 Ui[i + 1] = Ui[i] + ulen; 1986 } 1987 1988 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1989 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1990 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1991 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1992 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1993 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1994 1995 // Copy row/col permutation to device 1996 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1997 PetscBool row_identity; 1998 PetscCall(ISIdentity(rowperm, &row_identity)); 1999 if (!row_identity) { 2000 const PetscInt *ip; 2001 2002 PetscCall(ISGetIndices(rowperm, &ip)); 2003 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 2004 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2005 PetscCall(ISRestoreIndices(rowperm, &ip)); 2006 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2007 } 2008 2009 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2010 PetscBool col_identity; 2011 PetscCall(ISIdentity(colperm, &col_identity)); 2012 if (!col_identity) { 2013 const PetscInt *ip; 2014 2015 PetscCall(ISGetIndices(colperm, &ip)); 2016 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2017 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2018 PetscCall(ISRestoreIndices(colperm, &ip)); 2019 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2020 } 2021 2022 /* Create sptrsv handles for L, U and their transpose */ 2023 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2024 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2025 #else 2026 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2027 #endif 2028 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2029 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2030 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2031 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2032 } 2033 2034 // Copy the value 2035 for (PetscInt i = 0; i < m; i++) { 2036 PetscInt llen = Bi[i + 1] - Bi[i]; 2037 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2038 const PetscInt *Li = factors->iL_h.data(); 2039 const PetscInt *Ui = factors->iU_h.data(); 2040 2041 PetscScalar *La = factors->aL_h.data(); 2042 PetscScalar *Ua = factors->aU_h.data(); 2043 2044 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2045 La[Li[i] + llen] = 1.0; // diagonal entry 2046 2047 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2048 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2049 } 2050 2051 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2052 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2053 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2054 factors->transpose_updated = PETSC_FALSE; 2055 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2056 2057 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2058 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2059 } 2060 2061 B->ops->matsolve = NULL; 2062 B->ops->matsolvetranspose = NULL; 2063 PetscFunctionReturn(PETSC_SUCCESS); 2064 } 2065 2066 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2067 { 2068 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2069 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2070 PetscInt fill_lev = info->levels; 2071 2072 PetscFunctionBegin; 2073 PetscCall(PetscLogGpuTimeBegin()); 2074 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2075 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2076 2077 auto a_d = aijkok->a_dual.view_device(); 2078 auto i_d = aijkok->i_dual.view_device(); 2079 auto j_d = aijkok->j_dual.view_device(); 2080 2081 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2082 2083 B->assembled = PETSC_TRUE; 2084 B->preallocated = PETSC_TRUE; 2085 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2086 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2087 B->ops->matsolve = NULL; 2088 B->ops->matsolvetranspose = NULL; 2089 2090 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2091 factors->transpose_updated = PETSC_FALSE; 2092 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2093 /* TODO: log flops, but how to know that? */ 2094 PetscCall(PetscLogGpuTimeEnd()); 2095 PetscFunctionReturn(PETSC_SUCCESS); 2096 } 2097 2098 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2099 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2100 { 2101 Mat_SeqAIJKokkos *aijkok; 2102 Mat_SeqAIJ *b; 2103 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2104 PetscInt fill_lev = info->levels; 2105 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2106 PetscInt n = A->rmap->n; 2107 2108 PetscFunctionBegin; 2109 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2110 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2111 2112 /* Create a spiluk handle and then do symbolic factorization */ 2113 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2114 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2115 2116 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2117 2118 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2119 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2120 Kokkos::realloc(factors->iU_d, n + 1); 2121 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2122 2123 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2124 auto i_d = aijkok->i_dual.view_device(); 2125 auto j_d = aijkok->j_dual.view_device(); 2126 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2127 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2128 2129 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2130 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2131 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2132 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2133 2134 /* TODO: add options to select sptrsv algorithms */ 2135 /* Create sptrsv handles for L, U and their transpose */ 2136 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2137 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2138 #else 2139 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2140 #endif 2141 2142 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2143 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2144 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2145 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2146 2147 /* Fill fields of the factor matrix B */ 2148 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2149 b = (Mat_SeqAIJ *)B->data; 2150 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2151 B->info.fill_ratio_given = info->fill; 2152 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2153 2154 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2155 PetscFunctionReturn(PETSC_SUCCESS); 2156 } 2157 2158 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2159 { 2160 PetscFunctionBegin; 2161 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2162 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2163 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2164 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2165 PetscFunctionReturn(PETSC_SUCCESS); 2166 } 2167 2168 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2169 { 2170 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2171 2172 PetscFunctionBegin; 2173 if (!info->factoronhost) { 2174 PetscCall(ISIdentity(isrow, &row_identity)); 2175 PetscCall(ISIdentity(iscol, &col_identity)); 2176 } 2177 2178 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2179 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2180 2181 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2182 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2183 } else { 2184 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2185 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2186 } 2187 PetscFunctionReturn(PETSC_SUCCESS); 2188 } 2189 2190 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2191 { 2192 PetscFunctionBegin; 2193 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2194 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2195 2196 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2197 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2198 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2199 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2200 const MatScalar *Ba = b->a; 2201 PetscInt m = B->rmap->n; 2202 2203 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2204 // Allocate memory and copy the structure 2205 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2206 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2207 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2208 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2209 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2210 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2211 2212 // Build jU_h from the skewed Aj 2213 PetscInt *Uj = factors->jU_h.data(); 2214 for (PetscInt i = 0; i < m; i++) { 2215 PetscInt ulen = Bi[i + 1] - Bi[i]; 2216 Uj[Bi[i]] = i; // diagonal entry 2217 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2218 } 2219 2220 // Copy iU, jU to device 2221 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2222 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2223 2224 // Copy row/col permutation to device 2225 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2226 PetscBool row_identity; 2227 PetscCall(ISIdentity(rowperm, &row_identity)); 2228 if (!row_identity) { 2229 const PetscInt *ip; 2230 2231 PetscCall(ISGetIndices(rowperm, &ip)); 2232 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2233 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2234 PetscCall(ISRestoreIndices(rowperm, &ip)); 2235 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2236 } 2237 2238 // Create sptrsv handles for U and U^T 2239 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2240 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2241 #else 2242 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2243 #endif 2244 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2245 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2246 } 2247 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2248 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2249 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2250 2251 // Copy the value 2252 PetscScalar *Ua = factors->aU_h.data(); 2253 PetscScalar *D = factors->D_h.data(); 2254 for (PetscInt i = 0; i < m; i++) { 2255 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2256 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2257 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2258 } 2259 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2260 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2261 2262 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2263 factors->transpose_updated = PETSC_FALSE; 2264 } 2265 2266 B->ops->matsolve = NULL; 2267 B->ops->matsolvetranspose = NULL; 2268 PetscFunctionReturn(PETSC_SUCCESS); 2269 } 2270 2271 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2272 { 2273 PetscFunctionBegin; 2274 if (info->solveonhost) { 2275 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2276 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2277 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2278 } 2279 2280 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2281 2282 if (!info->solveonhost) { 2283 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2284 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2285 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2286 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2287 } 2288 PetscFunctionReturn(PETSC_SUCCESS); 2289 } 2290 2291 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2292 { 2293 PetscFunctionBegin; 2294 if (info->solveonhost) { 2295 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2296 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2297 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2298 } 2299 2300 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2301 2302 if (!info->solveonhost) { 2303 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2304 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2305 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2306 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2307 } 2308 PetscFunctionReturn(PETSC_SUCCESS); 2309 } 2310 2311 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2312 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2313 { 2314 PetscFunctionBegin; 2315 *type = MATSOLVERKOKKOS; 2316 PetscFunctionReturn(PETSC_SUCCESS); 2317 } 2318 2319 /*MC 2320 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2321 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2322 2323 Level: beginner 2324 2325 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2326 M*/ 2327 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2328 { 2329 PetscInt n = A->rmap->n; 2330 MPI_Comm comm; 2331 2332 PetscFunctionBegin; 2333 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2334 PetscCall(MatCreate(comm, B)); 2335 PetscCall(MatSetSizes(*B, n, n, n, n)); 2336 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2337 (*B)->factortype = ftype; 2338 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2339 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2340 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2341 2342 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2343 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2344 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2345 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2346 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2347 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2348 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2349 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2350 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2351 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2352 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2353 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2354 2355 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2356 (*B)->canuseordering = PETSC_TRUE; 2357 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2358 PetscFunctionReturn(PETSC_SUCCESS); 2359 } 2360 2361 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2362 { 2363 PetscFunctionBegin; 2364 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2365 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2366 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2367 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2368 PetscFunctionReturn(PETSC_SUCCESS); 2369 } 2370 2371 /* Utility to print out a KokkosCsrMatrix for debugging */ 2372 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2373 { 2374 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2375 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2376 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2377 const PetscInt *i = iv.data(); 2378 const PetscInt *j = jv.data(); 2379 const PetscScalar *a = av.data(); 2380 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2381 2382 PetscFunctionBegin; 2383 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2384 for (PetscInt k = 0; k < m; k++) { 2385 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2386 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2387 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2388 } 2389 PetscFunctionReturn(PETSC_SUCCESS); 2390 } 2391