1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165 *array = aijkok->a_dual.view_host().data(); 166 } else { /* Happens when calling MatSetValues on a newly created matrix */ 167 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168 } 169 PetscFunctionReturn(PETSC_SUCCESS); 170 } 171 172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175 176 PetscFunctionBegin; 177 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188 *array = aijkok->a_dual.view_host().data(); 189 } else { 190 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 191 } 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196 { 197 PetscFunctionBegin; 198 *array = NULL; 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201 202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203 { 204 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205 206 PetscFunctionBegin; 207 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208 *array = aijkok->a_dual.view_host().data(); 209 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 210 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 211 } 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218 219 PetscFunctionBegin; 220 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221 aijkok->a_dual.clear_sync_state(); 222 aijkok->a_dual.modify_host(); 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228 { 229 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 230 231 PetscFunctionBegin; 232 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 233 234 if (i) *i = aijkok->i_device_data(); 235 if (j) *j = aijkok->j_device_data(); 236 if (a) { 237 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 238 *a = aijkok->a_device_data(); 239 } 240 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 /* 245 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 246 247 Input Parameter: 248 . A - the MATSEQAIJKOKKOS matrix 249 250 Output Parameters: 251 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 252 - T_d - the transpose on device, whose value array is allocated but not initialized 253 */ 254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 258 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 259 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 260 MatRowMapType *Ti = Ti_h.data(); 261 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 262 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 263 PetscInt *Tj = Tj_h.data(); 264 PetscInt *perm = perm_h.data(); 265 PetscInt *offset; 266 267 PetscFunctionBegin; 268 // Populate Ti 269 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 270 Ti++; 271 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 272 Ti--; 273 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 274 275 // Populate Tj and the permutation array 276 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 277 for (PetscInt i = 0; i < m; i++) { 278 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 279 PetscInt r = Aj[j]; // row r of T 280 PetscInt disp = Ti[r] + offset[r]; 281 282 Tj[disp] = i; // col i of T 283 perm[disp] = j; 284 offset[r]++; 285 } 286 } 287 PetscCall(PetscFree(offset)); 288 289 // Sort each row of T, along with the permutation array 290 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 291 292 // Output perm and T on device 293 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 294 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 295 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 296 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 297 PetscFunctionReturn(PETSC_SUCCESS); 298 } 299 300 // Generate the transpose on device and cache it internally 301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 303 { 304 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 305 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 306 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 307 KokkosCsrMatrix &T = akok->csrmatT; 308 309 PetscFunctionBegin; 310 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 311 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 312 313 const auto &Aa = akok->a_dual.view_device(); 314 315 if (A->symmetric == PETSC_BOOL3_TRUE) { 316 *csrmatT = akok->csrmat; 317 } else { 318 // See if we already have a cached transpose and its value is up to date 319 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 320 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 321 const auto &perm = akok->transpose_perm; // get the permutation array 322 auto &Ta = T.values; 323 324 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 325 } 326 } else { // Generate T of size n x m for the first time 327 MatRowMapKokkosView perm; 328 329 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 330 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 331 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 332 } 333 akok->transpose_updated = PETSC_TRUE; 334 *csrmatT = akok->csrmatT; 335 } 336 PetscFunctionReturn(PETSC_SUCCESS); 337 } 338 339 // Generate the Hermitian on device and cache it internally 340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 341 { 342 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 343 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 344 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 345 KokkosCsrMatrix &T = akok->csrmatH; 346 347 PetscFunctionBegin; 348 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 349 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 350 351 const auto &Aa = akok->a_dual.view_device(); 352 353 if (A->hermitian == PETSC_BOOL3_TRUE) { 354 *csrmatH = akok->csrmat; 355 } else { 356 // See if we already have a cached hermitian and its value is up to date 357 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 358 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 359 const auto &perm = akok->transpose_perm; // get the permutation array 360 auto &Ta = T.values; 361 362 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 363 } 364 } else { // Generate T of size n x m for the first time 365 MatRowMapKokkosView perm; 366 367 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 368 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 369 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 370 } 371 akok->hermitian_updated = PETSC_TRUE; 372 *csrmatH = akok->csrmatH; 373 } 374 PetscFunctionReturn(PETSC_SUCCESS); 375 } 376 377 /* y = A x */ 378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 379 { 380 Mat_SeqAIJKokkos *aijkok; 381 ConstPetscScalarKokkosView xv; 382 PetscScalarKokkosView yv; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 389 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 390 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 391 PetscCall(VecRestoreKokkosView(xx, &xv)); 392 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 393 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 394 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 395 PetscCall(PetscLogGpuTimeEnd()); 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 /* y = A^T x */ 400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 401 { 402 Mat_SeqAIJKokkos *aijkok; 403 const char *mode; 404 ConstPetscScalarKokkosView xv; 405 PetscScalarKokkosView yv; 406 KokkosCsrMatrix csrmat; 407 408 PetscFunctionBegin; 409 PetscCall(PetscLogGpuTimeBegin()); 410 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 411 PetscCall(VecGetKokkosView(xx, &xv)); 412 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 413 if (A->form_explicit_transpose) { 414 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 415 mode = "N"; 416 } else { 417 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 418 csrmat = aijkok->csrmat; 419 mode = "T"; 420 } 421 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 422 PetscCall(VecRestoreKokkosView(xx, &xv)); 423 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 424 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 425 PetscCall(PetscLogGpuTimeEnd()); 426 PetscFunctionReturn(PETSC_SUCCESS); 427 } 428 429 /* y = A^H x */ 430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 431 { 432 Mat_SeqAIJKokkos *aijkok; 433 const char *mode; 434 ConstPetscScalarKokkosView xv; 435 PetscScalarKokkosView yv; 436 KokkosCsrMatrix csrmat; 437 438 PetscFunctionBegin; 439 PetscCall(PetscLogGpuTimeBegin()); 440 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 441 PetscCall(VecGetKokkosView(xx, &xv)); 442 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 443 if (A->form_explicit_transpose) { 444 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 445 mode = "N"; 446 } else { 447 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 448 csrmat = aijkok->csrmat; 449 mode = "C"; 450 } 451 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 452 PetscCall(VecRestoreKokkosView(xx, &xv)); 453 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 454 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 455 PetscCall(PetscLogGpuTimeEnd()); 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* z = A x + y */ 460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 461 { 462 Mat_SeqAIJKokkos *aijkok; 463 ConstPetscScalarKokkosView xv; 464 PetscScalarKokkosView zv; 465 466 PetscFunctionBegin; 467 PetscCall(PetscLogGpuTimeBegin()); 468 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 469 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 470 PetscCall(VecGetKokkosView(xx, &xv)); 471 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 472 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 473 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 474 PetscCall(VecRestoreKokkosView(xx, &xv)); 475 PetscCall(VecRestoreKokkosView(zz, &zv)); 476 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 477 PetscCall(PetscLogGpuTimeEnd()); 478 PetscFunctionReturn(PETSC_SUCCESS); 479 } 480 481 /* z = A^T x + y */ 482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 483 { 484 Mat_SeqAIJKokkos *aijkok; 485 const char *mode; 486 ConstPetscScalarKokkosView xv; 487 PetscScalarKokkosView zv; 488 KokkosCsrMatrix csrmat; 489 490 PetscFunctionBegin; 491 PetscCall(PetscLogGpuTimeBegin()); 492 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 493 if (zz != yy) PetscCall(VecCopy(yy, zz)); 494 PetscCall(VecGetKokkosView(xx, &xv)); 495 PetscCall(VecGetKokkosView(zz, &zv)); 496 if (A->form_explicit_transpose) { 497 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 498 mode = "N"; 499 } else { 500 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 501 csrmat = aijkok->csrmat; 502 mode = "T"; 503 } 504 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 505 PetscCall(VecRestoreKokkosView(xx, &xv)); 506 PetscCall(VecRestoreKokkosView(zz, &zv)); 507 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 508 PetscCall(PetscLogGpuTimeEnd()); 509 PetscFunctionReturn(PETSC_SUCCESS); 510 } 511 512 /* z = A^H x + y */ 513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 514 { 515 Mat_SeqAIJKokkos *aijkok; 516 const char *mode; 517 ConstPetscScalarKokkosView xv; 518 PetscScalarKokkosView zv; 519 KokkosCsrMatrix csrmat; 520 521 PetscFunctionBegin; 522 PetscCall(PetscLogGpuTimeBegin()); 523 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 524 if (zz != yy) PetscCall(VecCopy(yy, zz)); 525 PetscCall(VecGetKokkosView(xx, &xv)); 526 PetscCall(VecGetKokkosView(zz, &zv)); 527 if (A->form_explicit_transpose) { 528 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 529 mode = "N"; 530 } else { 531 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 532 csrmat = aijkok->csrmat; 533 mode = "C"; 534 } 535 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 536 PetscCall(VecRestoreKokkosView(xx, &xv)); 537 PetscCall(VecRestoreKokkosView(zz, &zv)); 538 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 539 PetscCall(PetscLogGpuTimeEnd()); 540 PetscFunctionReturn(PETSC_SUCCESS); 541 } 542 543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 544 { 545 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 546 547 PetscFunctionBegin; 548 switch (op) { 549 case MAT_FORM_EXPLICIT_TRANSPOSE: 550 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 551 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 552 A->form_explicit_transpose = flg; 553 break; 554 default: 555 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 556 break; 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 /* Depending on reuse, either build a new mat, or use the existing mat */ 562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 563 { 564 Mat_SeqAIJ *aseq; 565 566 PetscFunctionBegin; 567 PetscCall(PetscKokkosInitializeCheck()); 568 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 569 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 570 PetscCall(MatSetType(*newmat, mtype)); 571 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 572 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 573 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 574 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 575 PetscCall(PetscFree(A->defaultvectype)); 576 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 577 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 578 PetscCall(MatSetOps_SeqAIJKokkos(A)); 579 aseq = static_cast<Mat_SeqAIJ *>(A->data); 580 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 581 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 582 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 583 } 584 } 585 PetscFunctionReturn(PETSC_SUCCESS); 586 } 587 588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 589 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 590 */ 591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 592 { 593 Mat_SeqAIJ *bseq; 594 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 595 Mat mat; 596 597 PetscFunctionBegin; 598 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 599 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 600 mat = *B; 601 if (A->assembled) { 602 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 603 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 604 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 605 /* Now copy values to B if needed */ 606 if (dupOption == MAT_COPY_VALUES) { 607 if (akok->a_dual.need_sync_device()) { 608 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 609 bkok->a_dual.modify_host(); 610 } else { /* If device has the latest data, we only copy data on device */ 611 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 612 bkok->a_dual.modify_device(); 613 } 614 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 615 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 616 bkok->a_dual.modify_host(); 617 } 618 mat->spptr = bkok; 619 } 620 621 PetscCall(PetscFree(mat->defaultvectype)); 622 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 623 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 624 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 625 PetscFunctionReturn(PETSC_SUCCESS); 626 } 627 628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 629 { 630 Mat At; 631 KokkosCsrMatrix internT; 632 Mat_SeqAIJKokkos *atkok, *bkok; 633 634 PetscFunctionBegin; 635 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 636 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 637 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 638 /* Deep copy internT, as we want to isolate the internal transpose */ 639 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 640 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 641 if (reuse == MAT_INITIAL_MATRIX) *B = At; 642 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 643 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 644 if ((*B)->assembled) { 645 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 646 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 647 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 648 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 649 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 650 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 651 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 652 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 653 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 654 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 655 } 656 PetscFunctionReturn(PETSC_SUCCESS); 657 } 658 659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 660 { 661 Mat_SeqAIJKokkos *aijkok; 662 663 PetscFunctionBegin; 664 if (A->factortype == MAT_FACTOR_NONE) { 665 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 666 delete aijkok; 667 } else { 668 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 669 } 670 A->spptr = NULL; 671 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 672 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 673 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 674 #if defined(PETSC_HAVE_HYPRE) 675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 676 #endif 677 PetscCall(MatDestroy_SeqAIJ(A)); 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 /*MC 682 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 683 684 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 685 686 Options Database Key: 687 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 688 689 Level: beginner 690 691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 692 M*/ 693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 694 { 695 PetscFunctionBegin; 696 PetscCall(PetscKokkosInitializeCheck()); 697 PetscCall(MatCreate_SeqAIJ(A)); 698 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 699 PetscFunctionReturn(PETSC_SUCCESS); 700 } 701 702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 704 { 705 Mat_SeqAIJ *a, *b; 706 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 707 MatScalarKokkosView aa, ba, ca; 708 MatRowMapKokkosView ai, bi, ci; 709 MatColIdxKokkosView aj, bj, cj; 710 PetscInt m, n, nnz, aN; 711 712 PetscFunctionBegin; 713 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 714 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 715 PetscAssertPointer(C, 4); 716 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 717 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 718 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 719 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 720 721 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 722 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 723 a = static_cast<Mat_SeqAIJ *>(A->data); 724 b = static_cast<Mat_SeqAIJ *>(B->data); 725 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 726 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 727 aa = akok->a_dual.view_device(); 728 ai = akok->i_dual.view_device(); 729 ba = bkok->a_dual.view_device(); 730 bi = bkok->i_dual.view_device(); 731 m = A->rmap->n; /* M, N and nnz of C */ 732 n = A->cmap->n + B->cmap->n; 733 nnz = a->nz + b->nz; 734 aN = A->cmap->n; /* N of A */ 735 if (reuse == MAT_INITIAL_MATRIX) { 736 aj = akok->j_dual.view_device(); 737 bj = bkok->j_dual.view_device(); 738 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 739 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 740 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 741 ca = ca_dual.view_device(); 742 ci = ci_dual.view_device(); 743 cj = cj_dual.view_device(); 744 745 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 746 Kokkos::parallel_for( 747 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 748 PetscInt i = t.league_rank(); /* row i */ 749 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 750 751 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 752 ci(i) = coffset; 753 if (i == m - 1) ci(m) = ai(m) + bi(m); 754 }); 755 756 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 757 if (k < alen) { 758 ca(coffset + k) = aa(ai(i) + k); 759 cj(coffset + k) = aj(ai(i) + k); 760 } else { 761 ca(coffset + k) = ba(bi(i) + k - alen); 762 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 763 } 764 }); 765 }); 766 ca_dual.modify_device(); 767 ci_dual.modify_device(); 768 cj_dual.modify_device(); 769 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 770 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 771 } else if (reuse == MAT_REUSE_MATRIX) { 772 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 773 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 774 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 775 ca = ckok->a_dual.view_device(); 776 ci = ckok->i_dual.view_device(); 777 778 Kokkos::parallel_for( 779 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 780 PetscInt i = t.league_rank(); /* row i */ 781 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 782 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 783 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 784 else ca(ci(i) + k) = ba(bi(i) + k - alen); 785 }); 786 }); 787 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 788 } 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 793 { 794 PetscFunctionBegin; 795 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 800 { 801 Mat_Product *product = C->product; 802 Mat A, B; 803 bool transA, transB; /* use bool, since KK needs this type */ 804 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 805 Mat_SeqAIJ *c; 806 MatProductData_SeqAIJKokkos *pdata; 807 KokkosCsrMatrix csrmatA, csrmatB; 808 809 PetscFunctionBegin; 810 MatCheckProduct(C, 1); 811 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 812 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 813 814 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 815 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 816 // we still do numeric. 817 if (pdata->reusesym) { // numeric reuses results from symbolic 818 pdata->reusesym = PETSC_FALSE; 819 PetscFunctionReturn(PETSC_SUCCESS); 820 } 821 822 switch (product->type) { 823 case MATPRODUCT_AB: 824 transA = false; 825 transB = false; 826 break; 827 case MATPRODUCT_AtB: 828 transA = true; 829 transB = false; 830 break; 831 case MATPRODUCT_ABt: 832 transA = false; 833 transB = true; 834 break; 835 default: 836 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 837 } 838 839 A = product->A; 840 B = product->B; 841 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 842 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 843 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 844 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 845 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 846 847 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 848 849 csrmatA = akok->csrmat; 850 csrmatB = bkok->csrmat; 851 852 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 853 if (transA) { 854 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 855 transA = false; 856 } 857 858 if (transB) { 859 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 860 transB = false; 861 } 862 PetscCall(PetscLogGpuTimeBegin()); 863 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 865 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 866 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 867 #endif 868 869 PetscCall(PetscLogGpuTimeEnd()); 870 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 871 /* shorter version of MatAssemblyEnd_SeqAIJ */ 872 c = (Mat_SeqAIJ *)C->data; 873 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 874 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 875 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 876 c->reallocs = 0; 877 C->info.mallocs = 0; 878 C->info.nz_unneeded = 0; 879 C->assembled = C->was_assembled = PETSC_TRUE; 880 C->num_ass++; 881 PetscFunctionReturn(PETSC_SUCCESS); 882 } 883 884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 885 { 886 Mat_Product *product = C->product; 887 MatProductType ptype; 888 Mat A, B; 889 bool transA, transB; 890 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 891 MatProductData_SeqAIJKokkos *pdata; 892 MPI_Comm comm; 893 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 894 895 PetscFunctionBegin; 896 MatCheckProduct(C, 1); 897 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 898 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 899 A = product->A; 900 B = product->B; 901 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 902 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 903 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 904 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 905 csrmatA = akok->csrmat; 906 csrmatB = bkok->csrmat; 907 908 ptype = product->type; 909 // Take advantage of the symmetry if true 910 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 911 ptype = MATPRODUCT_AB; 912 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 913 } 914 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 915 ptype = MATPRODUCT_AB; 916 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 917 } 918 919 switch (ptype) { 920 case MATPRODUCT_AB: 921 transA = false; 922 transB = false; 923 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 924 break; 925 case MATPRODUCT_AtB: 926 transA = true; 927 transB = false; 928 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 929 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 930 break; 931 case MATPRODUCT_ABt: 932 transA = false; 933 transB = true; 934 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 935 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 936 break; 937 default: 938 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 939 } 940 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 941 pdata->reusesym = product->api_user; 942 943 /* TODO: add command line options to select spgemm algorithms */ 944 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 945 946 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 948 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 949 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 950 #endif 951 #endif 952 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 953 954 PetscCall(PetscLogGpuTimeBegin()); 955 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 956 if (transA) { 957 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 958 transA = false; 959 } 960 961 if (transB) { 962 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 963 transB = false; 964 } 965 966 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 967 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 968 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 969 calling new Mat_SeqAIJKokkos(). 970 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 971 */ 972 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 974 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 975 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 976 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 977 #endif 978 PetscCall(PetscLogGpuTimeEnd()); 979 980 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 981 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 982 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 983 PetscFunctionReturn(PETSC_SUCCESS); 984 } 985 986 /* handles sparse matrix matrix ops */ 987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 988 { 989 Mat_Product *product = mat->product; 990 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 991 992 PetscFunctionBegin; 993 MatCheckProduct(mat, 1); 994 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 995 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 996 if (Biskok && Ciskok) { 997 switch (product->type) { 998 case MATPRODUCT_AB: 999 case MATPRODUCT_AtB: 1000 case MATPRODUCT_ABt: 1001 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1002 break; 1003 case MATPRODUCT_PtAP: 1004 case MATPRODUCT_RARt: 1005 case MATPRODUCT_ABC: 1006 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1007 break; 1008 default: 1009 break; 1010 } 1011 } else { /* fallback for AIJ */ 1012 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1018 { 1019 Mat_SeqAIJKokkos *aijkok; 1020 1021 PetscFunctionBegin; 1022 PetscCall(PetscLogGpuTimeBegin()); 1023 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1024 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1025 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1026 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1027 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1028 PetscCall(PetscLogGpuTimeEnd()); 1029 PetscFunctionReturn(PETSC_SUCCESS); 1030 } 1031 1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1034 { 1035 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1036 1037 PetscFunctionBegin; 1038 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1039 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1040 1041 PetscCall(PetscLogGpuTimeBegin()); 1042 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1043 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1044 const auto &Aa = aijkok->a_dual.view_device(); 1045 const auto &Adiag = aijkok->diag_dual.view_device(); 1046 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1047 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1048 PetscCall(PetscLogGpuFlops(n)); 1049 PetscCall(PetscLogGpuTimeEnd()); 1050 } else { // need reassembly, very slow! 1051 PetscCall(MatShift_Basic(A, a)); 1052 } 1053 PetscFunctionReturn(PETSC_SUCCESS); 1054 } 1055 1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1057 { 1058 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1059 1060 PetscFunctionBegin; 1061 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1062 ConstPetscScalarKokkosView dv; 1063 PetscInt n, nv; 1064 1065 PetscCall(PetscLogGpuTimeBegin()); 1066 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1067 PetscCall(VecGetKokkosView(D, &dv)); 1068 PetscCall(VecGetLocalSize(D, &nv)); 1069 n = PetscMin(Y->rmap->n, Y->cmap->n); 1070 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1071 1072 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1073 const auto &Aa = aijkok->a_dual.view_device(); 1074 const auto &Adiag = aijkok->diag_dual.view_device(); 1075 PetscCallCXX(Kokkos::parallel_for( 1076 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1077 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1078 else Aa(Adiag(i)) += dv(i); 1079 })); 1080 PetscCall(VecRestoreKokkosView(D, &dv)); 1081 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1082 PetscCall(PetscLogGpuFlops(n)); 1083 PetscCall(PetscLogGpuTimeEnd()); 1084 } else { // need reassembly, very slow! 1085 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1086 } 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1091 { 1092 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1093 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1094 ConstPetscScalarKokkosView lv, rv; 1095 1096 PetscFunctionBegin; 1097 PetscCall(PetscLogGpuTimeBegin()); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 const auto &Aa = aijkok->a_dual.view_device(); 1101 const auto &Ai = aijkok->i_dual.view_device(); 1102 const auto &Aj = aijkok->j_dual.view_device(); 1103 if (ll) { 1104 PetscCall(VecGetLocalSize(ll, &m)); 1105 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1106 PetscCall(VecGetKokkosView(ll, &lv)); 1107 PetscCallCXX(Kokkos::parallel_for( // for each row 1108 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1109 PetscInt i = t.league_rank(); // row i 1110 PetscInt len = Ai(i + 1) - Ai(i); 1111 // scale entries on the row 1112 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1113 })); 1114 PetscCall(VecRestoreKokkosView(ll, &lv)); 1115 PetscCall(PetscLogGpuFlops(nz)); 1116 } 1117 if (rr) { 1118 PetscCall(VecGetLocalSize(rr, &n)); 1119 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1120 PetscCall(VecGetKokkosView(rr, &rv)); 1121 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1122 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1123 PetscCall(VecRestoreKokkosView(rr, &lv)); 1124 PetscCall(PetscLogGpuFlops(nz)); 1125 } 1126 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1127 PetscCall(PetscLogGpuTimeEnd()); 1128 PetscFunctionReturn(PETSC_SUCCESS); 1129 } 1130 1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1132 { 1133 Mat_SeqAIJKokkos *aijkok; 1134 1135 PetscFunctionBegin; 1136 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1137 if (aijkok) { /* Only zero the device if data is already there */ 1138 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1139 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1140 } else { /* Might be preallocated but not assembled */ 1141 PetscCall(MatZeroEntries_SeqAIJ(A)); 1142 } 1143 PetscFunctionReturn(PETSC_SUCCESS); 1144 } 1145 1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1147 { 1148 Mat_SeqAIJKokkos *aijkok; 1149 PetscInt n; 1150 PetscScalarKokkosView xv; 1151 1152 PetscFunctionBegin; 1153 PetscCall(VecGetLocalSize(x, &n)); 1154 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1155 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1156 1157 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1158 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1159 1160 const auto &Aa = aijkok->a_dual.view_device(); 1161 const auto &Ai = aijkok->i_dual.view_device(); 1162 const auto &Adiag = aijkok->diag_dual.view_device(); 1163 1164 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1165 Kokkos::parallel_for( 1166 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1167 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1168 else xv(i) = 0; 1169 }); 1170 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1171 PetscFunctionReturn(PETSC_SUCCESS); 1172 } 1173 1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1176 { 1177 Mat_SeqAIJKokkos *aijkok; 1178 1179 PetscFunctionBegin; 1180 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1181 PetscAssertPointer(kv, 2); 1182 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1183 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1184 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1185 *kv = aijkok->a_dual.view_device(); 1186 PetscFunctionReturn(PETSC_SUCCESS); 1187 } 1188 1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1190 { 1191 PetscFunctionBegin; 1192 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1193 PetscAssertPointer(kv, 2); 1194 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1199 { 1200 Mat_SeqAIJKokkos *aijkok; 1201 1202 PetscFunctionBegin; 1203 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1204 PetscAssertPointer(kv, 2); 1205 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1206 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1207 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1208 *kv = aijkok->a_dual.view_device(); 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1213 { 1214 PetscFunctionBegin; 1215 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1216 PetscAssertPointer(kv, 2); 1217 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1218 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1223 { 1224 Mat_SeqAIJKokkos *aijkok; 1225 1226 PetscFunctionBegin; 1227 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1228 PetscAssertPointer(kv, 2); 1229 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1230 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1231 *kv = aijkok->a_dual.view_device(); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1236 { 1237 PetscFunctionBegin; 1238 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1239 PetscAssertPointer(kv, 2); 1240 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1241 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1246 { 1247 Mat_SeqAIJKokkos *akok; 1248 1249 PetscFunctionBegin; 1250 // Create host copies of the input aij 1251 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1252 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1253 // Don't copy the vals to the host now 1254 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1255 1256 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1257 // Note we have modified device data so it will copy lazily 1258 a_dual.modify_device(); 1259 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1260 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1261 1262 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1263 PetscCall(MatCreate(comm, A)); 1264 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1265 PetscFunctionReturn(PETSC_SUCCESS); 1266 } 1267 1268 /* Computes Y += alpha X */ 1269 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1270 { 1271 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1272 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1273 ConstMatScalarKokkosView Xa; 1274 MatScalarKokkosView Ya; 1275 auto exec = PetscGetKokkosExecutionSpace(); 1276 1277 PetscFunctionBegin; 1278 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1279 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1280 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1281 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1282 PetscCall(PetscLogGpuTimeBegin()); 1283 1284 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1285 PetscBool e; 1286 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1287 if (e) { 1288 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1289 if (e) pattern = SAME_NONZERO_PATTERN; 1290 } 1291 } 1292 1293 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1294 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1295 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1296 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1297 */ 1298 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1299 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1300 Xa = xkok->a_dual.view_device(); 1301 Ya = ykok->a_dual.view_device(); 1302 1303 if (pattern == SAME_NONZERO_PATTERN) { 1304 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1305 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1306 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1307 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1308 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1309 1310 Kokkos::parallel_for( 1311 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1312 PetscInt i = t.league_rank(); // row i 1313 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1314 // Only one thread works in a team 1315 PetscInt p, q = Yi(i); 1316 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1317 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1318 if (Xj(p) == Yj(q)) { // Found it 1319 Ya(q) += alpha * Xa(p); 1320 q++; 1321 } else { 1322 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1323 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1324 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1325 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1326 #else 1327 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1328 #endif 1329 } 1330 } 1331 }); 1332 }); 1333 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1334 } else { // different nonzero patterns 1335 Mat Z; 1336 KokkosCsrMatrix zcsr; 1337 KernelHandle kh; 1338 kh.create_spadd_handle(true); // X, Y are sorted 1339 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1340 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1341 zkok = new Mat_SeqAIJKokkos(zcsr); 1342 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1343 PetscCall(MatHeaderReplace(Y, &Z)); 1344 kh.destroy_spadd_handle(); 1345 } 1346 PetscCall(PetscLogGpuTimeEnd()); 1347 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1348 PetscFunctionReturn(PETSC_SUCCESS); 1349 } 1350 1351 struct MatCOOStruct_SeqAIJKokkos { 1352 PetscCount n; 1353 PetscCount Atot; 1354 PetscInt nz; 1355 PetscCountKokkosView jmap; 1356 PetscCountKokkosView perm; 1357 1358 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1359 { 1360 nz = coo_h->nz; 1361 n = coo_h->n; 1362 Atot = coo_h->Atot; 1363 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1364 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1365 } 1366 }; 1367 1368 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1369 { 1370 PetscFunctionBegin; 1371 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1372 PetscFunctionReturn(PETSC_SUCCESS); 1373 } 1374 1375 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1376 { 1377 Mat_SeqAIJKokkos *akok; 1378 Mat_SeqAIJ *aseq; 1379 PetscContainer container_h; 1380 MatCOOStruct_SeqAIJ *coo_h; 1381 MatCOOStruct_SeqAIJKokkos *coo_d; 1382 1383 PetscFunctionBegin; 1384 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1385 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1386 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1387 delete akok; 1388 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1389 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1390 1391 // Copy the COO struct to device 1392 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1393 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1394 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1395 1396 // Put the COO struct in a container and then attach that to the matrix 1397 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1402 { 1403 MatScalarKokkosView Aa; 1404 ConstMatScalarKokkosView kv; 1405 PetscMemType memtype; 1406 PetscContainer container; 1407 MatCOOStruct_SeqAIJKokkos *coo; 1408 1409 PetscFunctionBegin; 1410 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1411 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1412 1413 const auto &n = coo->n; 1414 const auto &Annz = coo->nz; 1415 const auto &jmap = coo->jmap; 1416 const auto &perm = coo->perm; 1417 1418 PetscCall(PetscGetMemType(v, &memtype)); 1419 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1420 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1421 } else { 1422 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1423 } 1424 1425 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1426 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1427 1428 PetscCall(PetscLogGpuTimeBegin()); 1429 Kokkos::parallel_for( 1430 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1431 PetscScalar sum = 0.0; 1432 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1433 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1434 }); 1435 PetscCall(PetscLogGpuTimeEnd()); 1436 1437 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1438 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1439 PetscFunctionReturn(PETSC_SUCCESS); 1440 } 1441 1442 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1443 { 1444 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1445 1446 PetscFunctionBegin; 1447 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1448 A->boundtocpu = PETSC_FALSE; 1449 1450 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1451 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1452 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1453 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1454 A->ops->scale = MatScale_SeqAIJKokkos; 1455 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1456 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1457 A->ops->mult = MatMult_SeqAIJKokkos; 1458 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1459 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1460 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1461 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1462 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1463 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1464 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1465 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1466 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1467 A->ops->shift = MatShift_SeqAIJKokkos; 1468 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1469 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1470 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1471 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1472 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1473 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1474 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1475 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1476 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1477 1478 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1479 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1480 #if defined(PETSC_HAVE_HYPRE) 1481 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1482 #endif 1483 PetscFunctionReturn(PETSC_SUCCESS); 1484 } 1485 1486 /* 1487 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1488 1489 Input Parameters: 1490 + A - the MATSEQAIJKOKKOS matrix 1491 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1492 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1493 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1494 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1495 1496 Output Parameter: 1497 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1498 */ 1499 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1500 { 1501 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1502 PetscInt nblocks = bs.extent(0) - 1; 1503 1504 PetscFunctionBegin; 1505 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1506 1507 // Pull out the diagonal blocks of the matrix and then invert the blocks 1508 auto Aa = akok->a_dual.view_device(); 1509 auto Ai = akok->i_dual.view_device(); 1510 auto Aj = akok->j_dual.view_device(); 1511 auto Adiag = akok->diag_dual.view_device(); 1512 // TODO: how to tune the team size? 1513 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1514 auto ts = Kokkos::AUTO(); 1515 #else 1516 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1517 #endif 1518 PetscCallCXX(Kokkos::parallel_for( 1519 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1520 const PetscInt bid = teamMember.league_rank(); // block id 1521 const PetscInt rstart = bs(bid); // this block starts from this row 1522 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1523 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1524 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1525 1526 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1527 PetscInt i = rstart + r; // i-th row in A 1528 1529 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1530 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1531 1532 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1533 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1534 B(r, c) = 0.0; 1535 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1536 B(r, c) = Aa(first + c); 1537 } else { // this entry does not show up in the CSR 1538 B(r, c) = 0.0; 1539 } 1540 } 1541 } else { // rare case that the diagonal does not exist 1542 const PetscInt begin = Ai(i); 1543 const PetscInt end = Ai(i + 1); 1544 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1545 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1546 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1547 else if (Aj(j) >= rstart + m) break; 1548 } 1549 } 1550 }); 1551 1552 // LU-decompose B (w/o pivoting) and then invert B 1553 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1554 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1555 })); 1556 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1557 PetscFunctionReturn(PETSC_SUCCESS); 1558 } 1559 1560 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1561 { 1562 Mat_SeqAIJ *aseq; 1563 PetscInt i, m, n; 1564 auto exec = PetscGetKokkosExecutionSpace(); 1565 1566 PetscFunctionBegin; 1567 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1568 1569 m = akok->nrows(); 1570 n = akok->ncols(); 1571 PetscCall(MatSetSizes(A, m, n, m, n)); 1572 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1573 1574 /* Set up data structures of A as a MATSEQAIJ */ 1575 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1576 aseq = (Mat_SeqAIJ *)A->data; 1577 1578 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1579 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1580 1581 aseq->i = akok->i_host_data(); 1582 aseq->j = akok->j_host_data(); 1583 aseq->a = akok->a_host_data(); 1584 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1585 aseq->free_a = PETSC_FALSE; 1586 aseq->free_ij = PETSC_FALSE; 1587 aseq->nz = akok->nnz(); 1588 aseq->maxnz = aseq->nz; 1589 1590 PetscCall(PetscMalloc1(m, &aseq->imax)); 1591 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1592 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1593 1594 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1595 akok->nonzerostate = A->nonzerostate; 1596 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1597 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1598 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1599 PetscFunctionReturn(PETSC_SUCCESS); 1600 } 1601 1602 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1603 { 1604 PetscFunctionBegin; 1605 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1606 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1611 { 1612 Mat_SeqAIJKokkos *akok; 1613 1614 PetscFunctionBegin; 1615 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1616 PetscCall(MatCreate(comm, A)); 1617 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1618 PetscFunctionReturn(PETSC_SUCCESS); 1619 } 1620 1621 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1622 1623 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1624 */ 1625 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1626 { 1627 PetscFunctionBegin; 1628 PetscCall(MatCreate(comm, A)); 1629 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1630 PetscFunctionReturn(PETSC_SUCCESS); 1631 } 1632 1633 /*@C 1634 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1635 (the default parallel PETSc format). This matrix will ultimately be handled by 1636 Kokkos for calculations. 1637 1638 Collective 1639 1640 Input Parameters: 1641 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1642 . m - number of rows 1643 . n - number of columns 1644 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1645 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1646 1647 Output Parameter: 1648 . A - the matrix 1649 1650 Level: intermediate 1651 1652 Notes: 1653 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1654 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1655 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1656 1657 The AIJ format, also called 1658 compressed row storage, is fully compatible with standard Fortran 1659 storage. That is, the stored row and column indices can begin at 1660 either one (as in Fortran) or zero. 1661 1662 Specify the preallocated storage with either `nz` or `nnz` (not both). 1663 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1664 allocation. 1665 1666 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1667 @*/ 1668 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1669 { 1670 PetscFunctionBegin; 1671 PetscCall(PetscKokkosInitializeCheck()); 1672 PetscCall(MatCreate(comm, A)); 1673 PetscCall(MatSetSizes(*A, m, n, m, n)); 1674 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1675 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1676 PetscFunctionReturn(PETSC_SUCCESS); 1677 } 1678 1679 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1680 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1681 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1682 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1683 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1684 { 1685 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1686 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1687 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1688 1689 PetscFunctionBegin; 1690 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1691 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1692 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1693 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1694 } 1695 PetscFunctionReturn(PETSC_SUCCESS); 1696 } 1697 1698 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1699 { 1700 const PetscInt n = A->rmap->n; 1701 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1702 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1703 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1704 1705 PetscFunctionBegin; 1706 if (!factors->transpose_updated) { 1707 if (has_upper) { 1708 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1709 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1710 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1711 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1712 } 1713 1714 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1715 if (!factors->U) { 1716 Mat_SeqAIJ *seq; 1717 1718 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1719 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1720 1721 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1722 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1723 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1724 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1725 } else { 1726 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1727 } 1728 // Copy Ut from host to device 1729 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1730 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1731 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1732 } else { // If U was computed on device, we also compute the transpose there 1733 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1734 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1735 factors->jU_d, factors->aU_d, 1736 factors->iUt_d, factors->jUt_d, 1737 factors->aUt_d)); 1738 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1739 } 1740 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1741 } 1742 1743 // do the same for L with LU 1744 if (has_lower) { 1745 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1746 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1747 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1748 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1749 } 1750 1751 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1752 if (!factors->L) { 1753 Mat_SeqAIJ *seq; 1754 1755 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1756 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1757 1758 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1759 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1760 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1761 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1762 } else { 1763 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1764 } 1765 // Copy Lt from host to device 1766 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1767 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1768 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1769 } else { // If L was computed on device, we also compute the transpose there 1770 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1771 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1772 factors->jL_d, factors->aL_d, 1773 factors->iLt_d, factors->jLt_d, 1774 factors->aLt_d)); 1775 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1776 } 1777 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1778 } 1779 1780 factors->transpose_updated = PETSC_TRUE; 1781 } 1782 PetscFunctionReturn(PETSC_SUCCESS); 1783 } 1784 1785 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1786 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1787 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1788 { 1789 auto exec = PetscGetKokkosExecutionSpace(); 1790 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1791 PetscInt m = A->rmap->n; 1792 PetscScalarKokkosView D = factors->D_d; 1793 PetscScalarKokkosView X, Y, B; // alias 1794 ConstPetscScalarKokkosView b; 1795 PetscScalarKokkosView x; 1796 PetscIntKokkosView &rowperm = factors->rowperm; 1797 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1798 1799 PetscFunctionBegin; 1800 PetscCall(PetscLogGpuTimeBegin()); 1801 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1802 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1803 PetscCall(VecGetKokkosView(bb, &b)); 1804 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1805 1806 // Solve U^T Y = B 1807 if (identity) { // Reorder b with the row permutation 1808 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1809 Y = factors->workVector; 1810 } else { 1811 B = factors->workVector; 1812 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1813 Y = x; 1814 } 1815 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1816 1817 // Solve diag(D) Y' = Y. 1818 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1819 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1820 1821 // Solve UX = Y 1822 if (identity) { 1823 X = x; 1824 } else { 1825 X = factors->workVector; // B is not needed anymore 1826 } 1827 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1828 1829 // Reorder X with the inverse column (row) permutation 1830 if (!identity) { 1831 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1832 } 1833 1834 PetscCall(VecRestoreKokkosView(bb, &b)); 1835 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1836 PetscCall(PetscLogGpuTimeEnd()); 1837 PetscFunctionReturn(PETSC_SUCCESS); 1838 } 1839 1840 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1841 // R and C are represented by rowperm and colperm in factors. 1842 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1843 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1844 { 1845 auto exec = PetscGetKokkosExecutionSpace(); 1846 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1847 PetscInt m = A->rmap->n; 1848 PetscScalarKokkosView X, Y, B; // alias 1849 ConstPetscScalarKokkosView b; 1850 PetscScalarKokkosView x; 1851 PetscIntKokkosView &rowperm = factors->rowperm; 1852 PetscIntKokkosView &colperm = factors->colperm; 1853 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1854 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1855 1856 PetscFunctionBegin; 1857 PetscCall(PetscLogGpuTimeBegin()); 1858 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1859 PetscCall(VecGetKokkosView(bb, &b)); 1860 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1861 1862 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1863 if (row_identity) { 1864 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1865 Y = factors->workVector; 1866 } else { 1867 B = factors->workVector; 1868 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1869 Y = x; 1870 } 1871 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1872 1873 // Solve U C^- x = Y 1874 if (col_identity) { 1875 X = x; 1876 } else { 1877 X = factors->workVector; 1878 } 1879 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1880 1881 // x = C X; Reorder X with the inverse col permutation 1882 if (!col_identity) { 1883 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1884 } 1885 1886 PetscCall(VecRestoreKokkosView(bb, &b)); 1887 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1888 PetscCall(PetscLogGpuTimeEnd()); 1889 PetscFunctionReturn(PETSC_SUCCESS); 1890 } 1891 1892 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1893 // R and C are represented by rowperm and colperm in factors. 1894 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1895 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1896 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1897 { 1898 auto exec = PetscGetKokkosExecutionSpace(); 1899 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1900 PetscInt m = A->rmap->n; 1901 PetscScalarKokkosView X, Y, B; // alias 1902 ConstPetscScalarKokkosView b; 1903 PetscScalarKokkosView x; 1904 PetscIntKokkosView &rowperm = factors->rowperm; 1905 PetscIntKokkosView &colperm = factors->colperm; 1906 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1907 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1908 1909 PetscFunctionBegin; 1910 PetscCall(PetscLogGpuTimeBegin()); 1911 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1912 PetscCall(VecGetKokkosView(bb, &b)); 1913 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1914 1915 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1916 if (col_identity) { // Reorder b with the col permutation 1917 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1918 Y = factors->workVector; 1919 } else { 1920 B = factors->workVector; 1921 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1922 Y = x; 1923 } 1924 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1925 1926 // Solve L^T X = Y 1927 if (row_identity) { 1928 X = x; 1929 } else { 1930 X = factors->workVector; 1931 } 1932 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1933 1934 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1935 if (!row_identity) { 1936 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1937 } 1938 1939 PetscCall(VecRestoreKokkosView(bb, &b)); 1940 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1941 PetscCall(PetscLogGpuTimeEnd()); 1942 PetscFunctionReturn(PETSC_SUCCESS); 1943 } 1944 1945 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1946 { 1947 PetscFunctionBegin; 1948 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1949 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1950 1951 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1952 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1953 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1954 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1955 const MatScalar *Ba = b->a; 1956 PetscInt m = B->rmap->n, n = B->cmap->n; 1957 1958 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1959 // Allocate memory and copy the structure 1960 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1961 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1962 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1963 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1964 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1965 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1966 1967 PetscInt *Li = factors->iL_h.data(); 1968 PetscInt *Lj = factors->jL_h.data(); 1969 PetscInt *Ui = factors->iU_h.data(); 1970 PetscInt *Uj = factors->jU_h.data(); 1971 1972 Li[0] = Ui[0] = 0; 1973 for (PetscInt i = 0; i < m; i++) { 1974 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1975 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1976 1977 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1978 Lj[Li[i] + llen] = i; // diagonal entry of L 1979 1980 Uj[Ui[i]] = i; // diagonal entry of U 1981 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1982 1983 Li[i + 1] = Li[i] + llen + 1; 1984 Ui[i + 1] = Ui[i] + ulen; 1985 } 1986 1987 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1988 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1989 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1990 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1991 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1992 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1993 1994 // Copy row/col permutation to device 1995 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1996 PetscBool row_identity; 1997 PetscCall(ISIdentity(rowperm, &row_identity)); 1998 if (!row_identity) { 1999 const PetscInt *ip; 2000 2001 PetscCall(ISGetIndices(rowperm, &ip)); 2002 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 2003 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2004 PetscCall(ISRestoreIndices(rowperm, &ip)); 2005 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2006 } 2007 2008 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2009 PetscBool col_identity; 2010 PetscCall(ISIdentity(colperm, &col_identity)); 2011 if (!col_identity) { 2012 const PetscInt *ip; 2013 2014 PetscCall(ISGetIndices(colperm, &ip)); 2015 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2016 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2017 PetscCall(ISRestoreIndices(colperm, &ip)); 2018 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2019 } 2020 2021 /* Create sptrsv handles for L, U and their transpose */ 2022 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2023 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2024 #else 2025 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2026 #endif 2027 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2028 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2029 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2030 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2031 } 2032 2033 // Copy the value 2034 for (PetscInt i = 0; i < m; i++) { 2035 PetscInt llen = Bi[i + 1] - Bi[i]; 2036 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2037 const PetscInt *Li = factors->iL_h.data(); 2038 const PetscInt *Ui = factors->iU_h.data(); 2039 2040 PetscScalar *La = factors->aL_h.data(); 2041 PetscScalar *Ua = factors->aU_h.data(); 2042 2043 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2044 La[Li[i] + llen] = 1.0; // diagonal entry 2045 2046 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2047 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2048 } 2049 2050 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2051 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2052 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2053 factors->transpose_updated = PETSC_FALSE; 2054 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2055 2056 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2057 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2058 } 2059 2060 B->ops->matsolve = NULL; 2061 B->ops->matsolvetranspose = NULL; 2062 PetscFunctionReturn(PETSC_SUCCESS); 2063 } 2064 2065 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2066 { 2067 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2068 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2069 PetscInt fill_lev = info->levels; 2070 2071 PetscFunctionBegin; 2072 PetscCall(PetscLogGpuTimeBegin()); 2073 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2074 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2075 2076 auto a_d = aijkok->a_dual.view_device(); 2077 auto i_d = aijkok->i_dual.view_device(); 2078 auto j_d = aijkok->j_dual.view_device(); 2079 2080 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2081 2082 B->assembled = PETSC_TRUE; 2083 B->preallocated = PETSC_TRUE; 2084 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2085 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2086 B->ops->matsolve = NULL; 2087 B->ops->matsolvetranspose = NULL; 2088 2089 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2090 factors->transpose_updated = PETSC_FALSE; 2091 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2092 /* TODO: log flops, but how to know that? */ 2093 PetscCall(PetscLogGpuTimeEnd()); 2094 PetscFunctionReturn(PETSC_SUCCESS); 2095 } 2096 2097 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2098 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2099 { 2100 Mat_SeqAIJKokkos *aijkok; 2101 Mat_SeqAIJ *b; 2102 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2103 PetscInt fill_lev = info->levels; 2104 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2105 PetscInt n = A->rmap->n; 2106 2107 PetscFunctionBegin; 2108 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2109 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2110 2111 /* Create a spiluk handle and then do symbolic factorization */ 2112 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2113 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2114 2115 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2116 2117 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2118 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2119 Kokkos::realloc(factors->iU_d, n + 1); 2120 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2121 2122 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2123 auto i_d = aijkok->i_dual.view_device(); 2124 auto j_d = aijkok->j_dual.view_device(); 2125 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2126 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2127 2128 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2129 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2130 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2131 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2132 2133 /* TODO: add options to select sptrsv algorithms */ 2134 /* Create sptrsv handles for L, U and their transpose */ 2135 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2136 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2137 #else 2138 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2139 #endif 2140 2141 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2142 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2143 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2144 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2145 2146 /* Fill fields of the factor matrix B */ 2147 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2148 b = (Mat_SeqAIJ *)B->data; 2149 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2150 B->info.fill_ratio_given = info->fill; 2151 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2152 2153 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2154 PetscFunctionReturn(PETSC_SUCCESS); 2155 } 2156 2157 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2158 { 2159 PetscFunctionBegin; 2160 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2161 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2162 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2163 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2164 PetscFunctionReturn(PETSC_SUCCESS); 2165 } 2166 2167 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2168 { 2169 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2170 2171 PetscFunctionBegin; 2172 if (!info->factoronhost) { 2173 PetscCall(ISIdentity(isrow, &row_identity)); 2174 PetscCall(ISIdentity(iscol, &col_identity)); 2175 } 2176 2177 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2178 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2179 2180 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2181 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2182 } else { 2183 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2184 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2185 } 2186 PetscFunctionReturn(PETSC_SUCCESS); 2187 } 2188 2189 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2190 { 2191 PetscFunctionBegin; 2192 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2193 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2194 2195 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2196 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2197 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2198 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2199 const MatScalar *Ba = b->a; 2200 PetscInt m = B->rmap->n; 2201 2202 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2203 // Allocate memory and copy the structure 2204 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2205 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2206 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2207 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2208 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2209 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2210 2211 // Build jU_h from the skewed Aj 2212 PetscInt *Uj = factors->jU_h.data(); 2213 for (PetscInt i = 0; i < m; i++) { 2214 PetscInt ulen = Bi[i + 1] - Bi[i]; 2215 Uj[Bi[i]] = i; // diagonal entry 2216 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2217 } 2218 2219 // Copy iU, jU to device 2220 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2221 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2222 2223 // Copy row/col permutation to device 2224 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2225 PetscBool row_identity; 2226 PetscCall(ISIdentity(rowperm, &row_identity)); 2227 if (!row_identity) { 2228 const PetscInt *ip; 2229 2230 PetscCall(ISGetIndices(rowperm, &ip)); 2231 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2232 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2233 PetscCall(ISRestoreIndices(rowperm, &ip)); 2234 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2235 } 2236 2237 // Create sptrsv handles for U and U^T 2238 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2239 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2240 #else 2241 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2242 #endif 2243 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2244 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2245 } 2246 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2247 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2248 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2249 2250 // Copy the value 2251 PetscScalar *Ua = factors->aU_h.data(); 2252 PetscScalar *D = factors->D_h.data(); 2253 for (PetscInt i = 0; i < m; i++) { 2254 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2255 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2256 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2257 } 2258 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2259 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2260 2261 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2262 factors->transpose_updated = PETSC_FALSE; 2263 } 2264 2265 B->ops->matsolve = NULL; 2266 B->ops->matsolvetranspose = NULL; 2267 PetscFunctionReturn(PETSC_SUCCESS); 2268 } 2269 2270 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2271 { 2272 PetscFunctionBegin; 2273 if (info->solveonhost) { 2274 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2275 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2276 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2277 } 2278 2279 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2280 2281 if (!info->solveonhost) { 2282 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2283 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2284 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2285 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2286 } 2287 PetscFunctionReturn(PETSC_SUCCESS); 2288 } 2289 2290 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2291 { 2292 PetscFunctionBegin; 2293 if (info->solveonhost) { 2294 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2295 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2296 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2297 } 2298 2299 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2300 2301 if (!info->solveonhost) { 2302 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2303 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2304 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2305 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2306 } 2307 PetscFunctionReturn(PETSC_SUCCESS); 2308 } 2309 2310 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2311 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2312 { 2313 PetscFunctionBegin; 2314 *type = MATSOLVERKOKKOS; 2315 PetscFunctionReturn(PETSC_SUCCESS); 2316 } 2317 2318 /*MC 2319 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2320 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2321 2322 Level: beginner 2323 2324 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2325 M*/ 2326 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2327 { 2328 PetscInt n = A->rmap->n; 2329 MPI_Comm comm; 2330 2331 PetscFunctionBegin; 2332 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2333 PetscCall(MatCreate(comm, B)); 2334 PetscCall(MatSetSizes(*B, n, n, n, n)); 2335 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2336 (*B)->factortype = ftype; 2337 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2338 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2339 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2340 2341 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2342 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2343 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2344 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2345 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2346 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2347 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2348 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2349 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2350 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2351 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2352 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2353 2354 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2355 (*B)->canuseordering = PETSC_TRUE; 2356 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2357 PetscFunctionReturn(PETSC_SUCCESS); 2358 } 2359 2360 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2361 { 2362 PetscFunctionBegin; 2363 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2364 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2365 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2366 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2367 PetscFunctionReturn(PETSC_SUCCESS); 2368 } 2369 2370 /* Utility to print out a KokkosCsrMatrix for debugging */ 2371 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2372 { 2373 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2374 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2375 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2376 const PetscInt *i = iv.data(); 2377 const PetscInt *j = jv.data(); 2378 const PetscScalar *a = av.data(); 2379 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2380 2381 PetscFunctionBegin; 2382 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2383 for (PetscInt k = 0; k < m; k++) { 2384 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2385 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2386 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2387 } 2388 PetscFunctionReturn(PETSC_SUCCESS); 2389 } 2390