1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 aijkok->a_dual.sync_device(); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 auto exec = PetscGetKokkosExecutionSpace(); 165 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 166 PetscCallCXX(exec.fence()); 167 *array = aijkok->a_dual.view_host().data(); 168 } else { /* Happens when calling MatSetValues on a newly created matrix */ 169 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 170 } 171 PetscFunctionReturn(PETSC_SUCCESS); 172 } 173 174 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 180 PetscFunctionReturn(PETSC_SUCCESS); 181 } 182 183 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 184 { 185 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 186 187 PetscFunctionBegin; 188 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 189 auto exec = PetscGetKokkosExecutionSpace(); 190 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 191 PetscCallCXX(exec.fence()); 192 *array = aijkok->a_dual.view_host().data(); 193 } else { 194 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 195 } 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 200 { 201 PetscFunctionBegin; 202 *array = NULL; 203 PetscFunctionReturn(PETSC_SUCCESS); 204 } 205 206 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 207 { 208 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 209 210 PetscFunctionBegin; 211 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 212 *array = aijkok->a_dual.view_host().data(); 213 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 214 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 215 } 216 PetscFunctionReturn(PETSC_SUCCESS); 217 } 218 219 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 222 223 PetscFunctionBegin; 224 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 225 aijkok->a_dual.clear_sync_state(); 226 aijkok->a_dual.modify_host(); 227 } 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 232 { 233 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 234 235 PetscFunctionBegin; 236 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 237 238 if (i) *i = aijkok->i_device_data(); 239 if (j) *j = aijkok->j_device_data(); 240 if (a) { 241 aijkok->a_dual.sync_device(); 242 *a = aijkok->a_device_data(); 243 } 244 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 245 PetscFunctionReturn(PETSC_SUCCESS); 246 } 247 248 /* 249 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 250 251 Input Parameter: 252 . A - the MATSEQAIJKOKKOS matrix 253 254 Output Parameters: 255 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 256 - T_d - the transpose on device, whose value array is allocated but not initialized 257 */ 258 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 259 { 260 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 261 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 262 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 263 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 264 MatRowMapType *Ti = Ti_h.data(); 265 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 266 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 267 PetscInt *Tj = Tj_h.data(); 268 PetscInt *perm = perm_h.data(); 269 PetscInt *offset; 270 271 PetscFunctionBegin; 272 // Populate Ti 273 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 274 Ti++; 275 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 276 Ti--; 277 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 278 279 // Populate Tj and the permutation array 280 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 281 for (PetscInt i = 0; i < m; i++) { 282 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 283 PetscInt r = Aj[j]; // row r of T 284 PetscInt disp = Ti[r] + offset[r]; 285 286 Tj[disp] = i; // col i of T 287 perm[disp] = j; 288 offset[r]++; 289 } 290 } 291 PetscCall(PetscFree(offset)); 292 293 // Sort each row of T, along with the permutation array 294 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 295 296 // Output perm and T on device 297 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 298 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 299 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 300 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 301 PetscFunctionReturn(PETSC_SUCCESS); 302 } 303 304 // Generate the transpose on device and cache it internally 305 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 306 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 307 { 308 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 309 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 311 KokkosCsrMatrix &T = akok->csrmatT; 312 313 PetscFunctionBegin; 314 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 315 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 316 317 const auto &Aa = akok->a_dual.view_device(); 318 319 if (A->symmetric == PETSC_BOOL3_TRUE) { 320 *csrmatT = akok->csrmat; 321 } else { 322 // See if we already have a cached transpose and its value is up to date 323 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 324 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 325 const auto &perm = akok->transpose_perm; // get the permutation array 326 auto &Ta = T.values; 327 328 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 329 } 330 } else { // Generate T of size n x m for the first time 331 MatRowMapKokkosView perm; 332 333 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 334 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 335 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 336 } 337 akok->transpose_updated = PETSC_TRUE; 338 *csrmatT = akok->csrmatT; 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 // Generate the Hermitian on device and cache it internally 344 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 345 { 346 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 347 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 348 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 349 KokkosCsrMatrix &T = akok->csrmatH; 350 351 PetscFunctionBegin; 352 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 353 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 354 355 const auto &Aa = akok->a_dual.view_device(); 356 357 if (A->hermitian == PETSC_BOOL3_TRUE) { 358 *csrmatH = akok->csrmat; 359 } else { 360 // See if we already have a cached hermitian and its value is up to date 361 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 362 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 363 const auto &perm = akok->transpose_perm; // get the permutation array 364 auto &Ta = T.values; 365 366 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 367 } 368 } else { // Generate T of size n x m for the first time 369 MatRowMapKokkosView perm; 370 371 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 372 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 373 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 374 } 375 akok->hermitian_updated = PETSC_TRUE; 376 *csrmatH = akok->csrmatH; 377 } 378 PetscFunctionReturn(PETSC_SUCCESS); 379 } 380 381 /* y = A x */ 382 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 383 { 384 Mat_SeqAIJKokkos *aijkok; 385 ConstPetscScalarKokkosView xv; 386 PetscScalarKokkosView yv; 387 388 PetscFunctionBegin; 389 PetscCall(PetscLogGpuTimeBegin()); 390 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 391 PetscCall(VecGetKokkosView(xx, &xv)); 392 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 393 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 394 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 395 PetscCall(VecRestoreKokkosView(xx, &xv)); 396 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 397 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 398 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 399 PetscCall(PetscLogGpuTimeEnd()); 400 PetscFunctionReturn(PETSC_SUCCESS); 401 } 402 403 /* y = A^T x */ 404 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 405 { 406 Mat_SeqAIJKokkos *aijkok; 407 const char *mode; 408 ConstPetscScalarKokkosView xv; 409 PetscScalarKokkosView yv; 410 KokkosCsrMatrix csrmat; 411 412 PetscFunctionBegin; 413 PetscCall(PetscLogGpuTimeBegin()); 414 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 415 PetscCall(VecGetKokkosView(xx, &xv)); 416 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 417 if (A->form_explicit_transpose) { 418 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 419 mode = "N"; 420 } else { 421 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 422 csrmat = aijkok->csrmat; 423 mode = "T"; 424 } 425 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 426 PetscCall(VecRestoreKokkosView(xx, &xv)); 427 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 428 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 429 PetscCall(PetscLogGpuTimeEnd()); 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 433 /* y = A^H x */ 434 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 435 { 436 Mat_SeqAIJKokkos *aijkok; 437 const char *mode; 438 ConstPetscScalarKokkosView xv; 439 PetscScalarKokkosView yv; 440 KokkosCsrMatrix csrmat; 441 442 PetscFunctionBegin; 443 PetscCall(PetscLogGpuTimeBegin()); 444 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 445 PetscCall(VecGetKokkosView(xx, &xv)); 446 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 447 if (A->form_explicit_transpose) { 448 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 449 mode = "N"; 450 } else { 451 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 452 csrmat = aijkok->csrmat; 453 mode = "C"; 454 } 455 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 456 PetscCall(VecRestoreKokkosView(xx, &xv)); 457 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 458 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 459 PetscCall(PetscLogGpuTimeEnd()); 460 PetscFunctionReturn(PETSC_SUCCESS); 461 } 462 463 /* z = A x + y */ 464 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 465 { 466 Mat_SeqAIJKokkos *aijkok; 467 ConstPetscScalarKokkosView xv; 468 PetscScalarKokkosView zv; 469 470 PetscFunctionBegin; 471 PetscCall(PetscLogGpuTimeBegin()); 472 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 473 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 474 PetscCall(VecGetKokkosView(xx, &xv)); 475 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 476 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 477 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 478 PetscCall(VecRestoreKokkosView(xx, &xv)); 479 PetscCall(VecRestoreKokkosView(zz, &zv)); 480 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 481 PetscCall(PetscLogGpuTimeEnd()); 482 PetscFunctionReturn(PETSC_SUCCESS); 483 } 484 485 /* z = A^T x + y */ 486 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 487 { 488 Mat_SeqAIJKokkos *aijkok; 489 const char *mode; 490 ConstPetscScalarKokkosView xv; 491 PetscScalarKokkosView zv; 492 KokkosCsrMatrix csrmat; 493 494 PetscFunctionBegin; 495 PetscCall(PetscLogGpuTimeBegin()); 496 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 497 if (zz != yy) PetscCall(VecCopy(yy, zz)); 498 PetscCall(VecGetKokkosView(xx, &xv)); 499 PetscCall(VecGetKokkosView(zz, &zv)); 500 if (A->form_explicit_transpose) { 501 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 502 mode = "N"; 503 } else { 504 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 505 csrmat = aijkok->csrmat; 506 mode = "T"; 507 } 508 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 509 PetscCall(VecRestoreKokkosView(xx, &xv)); 510 PetscCall(VecRestoreKokkosView(zz, &zv)); 511 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 512 PetscCall(PetscLogGpuTimeEnd()); 513 PetscFunctionReturn(PETSC_SUCCESS); 514 } 515 516 /* z = A^H x + y */ 517 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 518 { 519 Mat_SeqAIJKokkos *aijkok; 520 const char *mode; 521 ConstPetscScalarKokkosView xv; 522 PetscScalarKokkosView zv; 523 KokkosCsrMatrix csrmat; 524 525 PetscFunctionBegin; 526 PetscCall(PetscLogGpuTimeBegin()); 527 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 528 if (zz != yy) PetscCall(VecCopy(yy, zz)); 529 PetscCall(VecGetKokkosView(xx, &xv)); 530 PetscCall(VecGetKokkosView(zz, &zv)); 531 if (A->form_explicit_transpose) { 532 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 533 mode = "N"; 534 } else { 535 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 536 csrmat = aijkok->csrmat; 537 mode = "C"; 538 } 539 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 540 PetscCall(VecRestoreKokkosView(xx, &xv)); 541 PetscCall(VecRestoreKokkosView(zz, &zv)); 542 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 543 PetscCall(PetscLogGpuTimeEnd()); 544 PetscFunctionReturn(PETSC_SUCCESS); 545 } 546 547 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 548 { 549 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 550 551 PetscFunctionBegin; 552 switch (op) { 553 case MAT_FORM_EXPLICIT_TRANSPOSE: 554 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 555 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 556 A->form_explicit_transpose = flg; 557 break; 558 default: 559 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 560 break; 561 } 562 PetscFunctionReturn(PETSC_SUCCESS); 563 } 564 565 /* Depending on reuse, either build a new mat, or use the existing mat */ 566 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 567 { 568 Mat_SeqAIJ *aseq; 569 570 PetscFunctionBegin; 571 PetscCall(PetscKokkosInitializeCheck()); 572 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 573 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 574 PetscCall(MatSetType(*newmat, mtype)); 575 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 576 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 577 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 578 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 579 PetscCall(PetscFree(A->defaultvectype)); 580 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 581 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 582 PetscCall(MatSetOps_SeqAIJKokkos(A)); 583 aseq = static_cast<Mat_SeqAIJ *>(A->data); 584 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 585 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 586 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 587 } 588 } 589 PetscFunctionReturn(PETSC_SUCCESS); 590 } 591 592 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 593 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 594 */ 595 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 596 { 597 Mat_SeqAIJ *bseq; 598 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 599 Mat mat; 600 601 PetscFunctionBegin; 602 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 603 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 604 mat = *B; 605 if (A->assembled) { 606 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 607 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 608 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 609 /* Now copy values to B if needed */ 610 if (dupOption == MAT_COPY_VALUES) { 611 if (akok->a_dual.need_sync_device()) { 612 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 613 bkok->a_dual.modify_host(); 614 } else { /* If device has the latest data, we only copy data on device */ 615 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 616 bkok->a_dual.modify_device(); 617 } 618 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 619 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 620 bkok->a_dual.modify_host(); 621 } 622 mat->spptr = bkok; 623 } 624 625 PetscCall(PetscFree(mat->defaultvectype)); 626 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 627 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 628 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 629 PetscFunctionReturn(PETSC_SUCCESS); 630 } 631 632 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 633 { 634 Mat At; 635 KokkosCsrMatrix internT; 636 Mat_SeqAIJKokkos *atkok, *bkok; 637 638 PetscFunctionBegin; 639 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 640 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 641 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 642 /* Deep copy internT, as we want to isolate the internal transpose */ 643 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 644 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 645 if (reuse == MAT_INITIAL_MATRIX) *B = At; 646 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 647 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 648 if ((*B)->assembled) { 649 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 650 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 651 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 652 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 653 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 654 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 655 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 656 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 657 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 658 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 659 } 660 PetscFunctionReturn(PETSC_SUCCESS); 661 } 662 663 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 664 { 665 Mat_SeqAIJKokkos *aijkok; 666 667 PetscFunctionBegin; 668 if (A->factortype == MAT_FACTOR_NONE) { 669 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 670 delete aijkok; 671 } else { 672 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 673 } 674 A->spptr = NULL; 675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 676 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 677 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 678 #if defined(PETSC_HAVE_HYPRE) 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 680 #endif 681 PetscCall(MatDestroy_SeqAIJ(A)); 682 PetscFunctionReturn(PETSC_SUCCESS); 683 } 684 685 /*MC 686 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 687 688 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 689 690 Options Database Key: 691 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 692 693 Level: beginner 694 695 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 696 M*/ 697 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 698 { 699 PetscFunctionBegin; 700 PetscCall(PetscKokkosInitializeCheck()); 701 PetscCall(MatCreate_SeqAIJ(A)); 702 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 703 PetscFunctionReturn(PETSC_SUCCESS); 704 } 705 706 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 707 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 708 { 709 Mat_SeqAIJ *a, *b; 710 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 711 MatScalarKokkosView aa, ba, ca; 712 MatRowMapKokkosView ai, bi, ci; 713 MatColIdxKokkosView aj, bj, cj; 714 PetscInt m, n, nnz, aN; 715 716 PetscFunctionBegin; 717 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 718 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 719 PetscAssertPointer(C, 4); 720 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 721 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 722 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 723 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 724 725 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 726 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 727 a = static_cast<Mat_SeqAIJ *>(A->data); 728 b = static_cast<Mat_SeqAIJ *>(B->data); 729 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 730 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 731 aa = akok->a_dual.view_device(); 732 ai = akok->i_dual.view_device(); 733 ba = bkok->a_dual.view_device(); 734 bi = bkok->i_dual.view_device(); 735 m = A->rmap->n; /* M, N and nnz of C */ 736 n = A->cmap->n + B->cmap->n; 737 nnz = a->nz + b->nz; 738 aN = A->cmap->n; /* N of A */ 739 if (reuse == MAT_INITIAL_MATRIX) { 740 aj = akok->j_dual.view_device(); 741 bj = bkok->j_dual.view_device(); 742 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 743 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 744 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 745 ca = ca_dual.view_device(); 746 ci = ci_dual.view_device(); 747 cj = cj_dual.view_device(); 748 749 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 750 Kokkos::parallel_for( 751 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752 PetscInt i = t.league_rank(); /* row i */ 753 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754 755 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 756 ci(i) = coffset; 757 if (i == m - 1) ci(m) = ai(m) + bi(m); 758 }); 759 760 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 761 if (k < alen) { 762 ca(coffset + k) = aa(ai(i) + k); 763 cj(coffset + k) = aj(ai(i) + k); 764 } else { 765 ca(coffset + k) = ba(bi(i) + k - alen); 766 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 767 } 768 }); 769 }); 770 ca_dual.modify_device(); 771 ci_dual.modify_device(); 772 cj_dual.modify_device(); 773 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 774 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 775 } else if (reuse == MAT_REUSE_MATRIX) { 776 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 777 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 778 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 779 ca = ckok->a_dual.view_device(); 780 ci = ckok->i_dual.view_device(); 781 782 Kokkos::parallel_for( 783 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 784 PetscInt i = t.league_rank(); /* row i */ 785 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 786 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 787 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 788 else ca(ci(i) + k) = ba(bi(i) + k - alen); 789 }); 790 }); 791 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 792 } 793 PetscFunctionReturn(PETSC_SUCCESS); 794 } 795 796 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 797 { 798 PetscFunctionBegin; 799 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 800 PetscFunctionReturn(PETSC_SUCCESS); 801 } 802 803 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 804 { 805 Mat_Product *product = C->product; 806 Mat A, B; 807 bool transA, transB; /* use bool, since KK needs this type */ 808 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 809 Mat_SeqAIJ *c; 810 MatProductData_SeqAIJKokkos *pdata; 811 KokkosCsrMatrix csrmatA, csrmatB; 812 813 PetscFunctionBegin; 814 MatCheckProduct(C, 1); 815 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 816 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 817 818 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 819 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 820 // we still do numeric. 821 if (pdata->reusesym) { // numeric reuses results from symbolic 822 pdata->reusesym = PETSC_FALSE; 823 PetscFunctionReturn(PETSC_SUCCESS); 824 } 825 826 switch (product->type) { 827 case MATPRODUCT_AB: 828 transA = false; 829 transB = false; 830 break; 831 case MATPRODUCT_AtB: 832 transA = true; 833 transB = false; 834 break; 835 case MATPRODUCT_ABt: 836 transA = false; 837 transB = true; 838 break; 839 default: 840 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 841 } 842 843 A = product->A; 844 B = product->B; 845 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 846 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 847 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 848 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 849 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 850 851 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 852 853 csrmatA = akok->csrmat; 854 csrmatB = bkok->csrmat; 855 856 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 857 if (transA) { 858 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 859 transA = false; 860 } 861 862 if (transB) { 863 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 864 transB = false; 865 } 866 PetscCall(PetscLogGpuTimeBegin()); 867 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 868 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 869 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 870 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 871 #endif 872 873 PetscCall(PetscLogGpuTimeEnd()); 874 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 875 /* shorter version of MatAssemblyEnd_SeqAIJ */ 876 c = (Mat_SeqAIJ *)C->data; 877 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 878 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 879 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 880 c->reallocs = 0; 881 C->info.mallocs = 0; 882 C->info.nz_unneeded = 0; 883 C->assembled = C->was_assembled = PETSC_TRUE; 884 C->num_ass++; 885 PetscFunctionReturn(PETSC_SUCCESS); 886 } 887 888 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 889 { 890 Mat_Product *product = C->product; 891 MatProductType ptype; 892 Mat A, B; 893 bool transA, transB; 894 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 895 MatProductData_SeqAIJKokkos *pdata; 896 MPI_Comm comm; 897 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 898 899 PetscFunctionBegin; 900 MatCheckProduct(C, 1); 901 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 902 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 903 A = product->A; 904 B = product->B; 905 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 906 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 907 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 908 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 909 csrmatA = akok->csrmat; 910 csrmatB = bkok->csrmat; 911 912 ptype = product->type; 913 // Take advantage of the symmetry if true 914 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 915 ptype = MATPRODUCT_AB; 916 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 917 } 918 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 919 ptype = MATPRODUCT_AB; 920 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 921 } 922 923 switch (ptype) { 924 case MATPRODUCT_AB: 925 transA = false; 926 transB = false; 927 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 928 break; 929 case MATPRODUCT_AtB: 930 transA = true; 931 transB = false; 932 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 933 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 934 break; 935 case MATPRODUCT_ABt: 936 transA = false; 937 transB = true; 938 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 939 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 940 break; 941 default: 942 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 943 } 944 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 945 pdata->reusesym = product->api_user; 946 947 /* TODO: add command line options to select spgemm algorithms */ 948 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 949 950 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 951 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 952 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 953 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 954 #endif 955 #endif 956 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 957 958 PetscCall(PetscLogGpuTimeBegin()); 959 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 960 if (transA) { 961 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 962 transA = false; 963 } 964 965 if (transB) { 966 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 967 transB = false; 968 } 969 970 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 971 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 972 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 973 calling new Mat_SeqAIJKokkos(). 974 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 975 */ 976 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 977 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 978 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 979 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 980 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 981 #endif 982 PetscCall(PetscLogGpuTimeEnd()); 983 984 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 985 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 986 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 /* handles sparse matrix matrix ops */ 991 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 992 { 993 Mat_Product *product = mat->product; 994 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 995 996 PetscFunctionBegin; 997 MatCheckProduct(mat, 1); 998 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 999 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 1000 if (Biskok && Ciskok) { 1001 switch (product->type) { 1002 case MATPRODUCT_AB: 1003 case MATPRODUCT_AtB: 1004 case MATPRODUCT_ABt: 1005 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1006 break; 1007 case MATPRODUCT_PtAP: 1008 case MATPRODUCT_RARt: 1009 case MATPRODUCT_ABC: 1010 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1011 break; 1012 default: 1013 break; 1014 } 1015 } else { /* fallback for AIJ */ 1016 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1017 } 1018 PetscFunctionReturn(PETSC_SUCCESS); 1019 } 1020 1021 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1022 { 1023 Mat_SeqAIJKokkos *aijkok; 1024 1025 PetscFunctionBegin; 1026 PetscCall(PetscLogGpuTimeBegin()); 1027 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1028 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1029 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1030 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1031 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1032 PetscCall(PetscLogGpuTimeEnd()); 1033 PetscFunctionReturn(PETSC_SUCCESS); 1034 } 1035 1036 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1037 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1038 { 1039 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1040 1041 PetscFunctionBegin; 1042 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1043 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1044 1045 PetscCall(PetscLogGpuTimeBegin()); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1047 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1048 const auto &Aa = aijkok->a_dual.view_device(); 1049 const auto &Adiag = aijkok->diag_dual.view_device(); 1050 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1051 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1052 PetscCall(PetscLogGpuFlops(n)); 1053 PetscCall(PetscLogGpuTimeEnd()); 1054 } else { // need reassembly, very slow! 1055 PetscCall(MatShift_Basic(A, a)); 1056 } 1057 PetscFunctionReturn(PETSC_SUCCESS); 1058 } 1059 1060 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1061 { 1062 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1063 1064 PetscFunctionBegin; 1065 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1066 ConstPetscScalarKokkosView dv; 1067 PetscInt n, nv; 1068 1069 PetscCall(PetscLogGpuTimeBegin()); 1070 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1071 PetscCall(VecGetKokkosView(D, &dv)); 1072 PetscCall(VecGetLocalSize(D, &nv)); 1073 n = PetscMin(Y->rmap->n, Y->cmap->n); 1074 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1075 1076 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1077 const auto &Aa = aijkok->a_dual.view_device(); 1078 const auto &Adiag = aijkok->diag_dual.view_device(); 1079 PetscCallCXX(Kokkos::parallel_for( 1080 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1081 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1082 else Aa(Adiag(i)) += dv(i); 1083 })); 1084 PetscCall(VecRestoreKokkosView(D, &dv)); 1085 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1086 PetscCall(PetscLogGpuFlops(n)); 1087 PetscCall(PetscLogGpuTimeEnd()); 1088 } else { // need reassembly, very slow! 1089 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1090 } 1091 PetscFunctionReturn(PETSC_SUCCESS); 1092 } 1093 1094 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1095 { 1096 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1097 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1098 ConstPetscScalarKokkosView lv, rv; 1099 1100 PetscFunctionBegin; 1101 PetscCall(PetscLogGpuTimeBegin()); 1102 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1103 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1104 const auto &Aa = aijkok->a_dual.view_device(); 1105 const auto &Ai = aijkok->i_dual.view_device(); 1106 const auto &Aj = aijkok->j_dual.view_device(); 1107 if (ll) { 1108 PetscCall(VecGetLocalSize(ll, &m)); 1109 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1110 PetscCall(VecGetKokkosView(ll, &lv)); 1111 PetscCallCXX(Kokkos::parallel_for( // for each row 1112 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1113 PetscInt i = t.league_rank(); // row i 1114 PetscInt len = Ai(i + 1) - Ai(i); 1115 // scale entries on the row 1116 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1117 })); 1118 PetscCall(VecRestoreKokkosView(ll, &lv)); 1119 PetscCall(PetscLogGpuFlops(nz)); 1120 } 1121 if (rr) { 1122 PetscCall(VecGetLocalSize(rr, &n)); 1123 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1124 PetscCall(VecGetKokkosView(rr, &rv)); 1125 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1126 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1127 PetscCall(VecRestoreKokkosView(rr, &lv)); 1128 PetscCall(PetscLogGpuFlops(nz)); 1129 } 1130 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1131 PetscCall(PetscLogGpuTimeEnd()); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1136 { 1137 Mat_SeqAIJKokkos *aijkok; 1138 1139 PetscFunctionBegin; 1140 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1141 if (aijkok) { /* Only zero the device if data is already there */ 1142 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1143 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1144 } else { /* Might be preallocated but not assembled */ 1145 PetscCall(MatZeroEntries_SeqAIJ(A)); 1146 } 1147 PetscFunctionReturn(PETSC_SUCCESS); 1148 } 1149 1150 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1151 { 1152 Mat_SeqAIJKokkos *aijkok; 1153 PetscInt n; 1154 PetscScalarKokkosView xv; 1155 1156 PetscFunctionBegin; 1157 PetscCall(VecGetLocalSize(x, &n)); 1158 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1159 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1160 1161 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1162 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1163 1164 const auto &Aa = aijkok->a_dual.view_device(); 1165 const auto &Ai = aijkok->i_dual.view_device(); 1166 const auto &Adiag = aijkok->diag_dual.view_device(); 1167 1168 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1169 Kokkos::parallel_for( 1170 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1171 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1172 else xv(i) = 0; 1173 }); 1174 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1175 PetscFunctionReturn(PETSC_SUCCESS); 1176 } 1177 1178 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1179 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1180 { 1181 Mat_SeqAIJKokkos *aijkok; 1182 1183 PetscFunctionBegin; 1184 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1185 PetscAssertPointer(kv, 2); 1186 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1187 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1188 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1189 *kv = aijkok->a_dual.view_device(); 1190 PetscFunctionReturn(PETSC_SUCCESS); 1191 } 1192 1193 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1194 { 1195 PetscFunctionBegin; 1196 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1197 PetscAssertPointer(kv, 2); 1198 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1199 PetscFunctionReturn(PETSC_SUCCESS); 1200 } 1201 1202 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1203 { 1204 Mat_SeqAIJKokkos *aijkok; 1205 1206 PetscFunctionBegin; 1207 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1208 PetscAssertPointer(kv, 2); 1209 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1210 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1211 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1212 *kv = aijkok->a_dual.view_device(); 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1217 { 1218 PetscFunctionBegin; 1219 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1220 PetscAssertPointer(kv, 2); 1221 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1222 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1223 PetscFunctionReturn(PETSC_SUCCESS); 1224 } 1225 1226 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1227 { 1228 Mat_SeqAIJKokkos *aijkok; 1229 1230 PetscFunctionBegin; 1231 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1232 PetscAssertPointer(kv, 2); 1233 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1234 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1235 *kv = aijkok->a_dual.view_device(); 1236 PetscFunctionReturn(PETSC_SUCCESS); 1237 } 1238 1239 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1240 { 1241 PetscFunctionBegin; 1242 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1243 PetscAssertPointer(kv, 2); 1244 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1245 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1246 PetscFunctionReturn(PETSC_SUCCESS); 1247 } 1248 1249 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1250 { 1251 Mat_SeqAIJKokkos *akok; 1252 1253 PetscFunctionBegin; 1254 // Create host copies of the input aij 1255 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1256 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1257 // Don't copy the vals to the host now 1258 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1259 1260 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1261 // Note we have modified device data so it will copy lazily 1262 a_dual.modify_device(); 1263 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1264 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1265 1266 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1267 PetscCall(MatCreate(comm, A)); 1268 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1269 PetscFunctionReturn(PETSC_SUCCESS); 1270 } 1271 1272 /* Computes Y += alpha X */ 1273 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1274 { 1275 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1276 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1277 ConstMatScalarKokkosView Xa; 1278 MatScalarKokkosView Ya; 1279 auto exec = PetscGetKokkosExecutionSpace(); 1280 1281 PetscFunctionBegin; 1282 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1283 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1284 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1285 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1286 PetscCall(PetscLogGpuTimeBegin()); 1287 1288 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1289 PetscBool e; 1290 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1291 if (e) { 1292 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1293 if (e) pattern = SAME_NONZERO_PATTERN; 1294 } 1295 } 1296 1297 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1298 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1299 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1300 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1301 */ 1302 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1303 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1304 Xa = xkok->a_dual.view_device(); 1305 Ya = ykok->a_dual.view_device(); 1306 1307 if (pattern == SAME_NONZERO_PATTERN) { 1308 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1309 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1310 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1311 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1312 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1313 1314 Kokkos::parallel_for( 1315 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1316 PetscInt i = t.league_rank(); // row i 1317 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1318 // Only one thread works in a team 1319 PetscInt p, q = Yi(i); 1320 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1321 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1322 if (Xj(p) == Yj(q)) { // Found it 1323 Ya(q) += alpha * Xa(p); 1324 q++; 1325 } else { 1326 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1327 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1328 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1329 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1330 #else 1331 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1332 #endif 1333 } 1334 } 1335 }); 1336 }); 1337 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1338 } else { // different nonzero patterns 1339 Mat Z; 1340 KokkosCsrMatrix zcsr; 1341 KernelHandle kh; 1342 kh.create_spadd_handle(true); // X, Y are sorted 1343 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1344 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1345 zkok = new Mat_SeqAIJKokkos(zcsr); 1346 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1347 PetscCall(MatHeaderReplace(Y, &Z)); 1348 kh.destroy_spadd_handle(); 1349 } 1350 PetscCall(PetscLogGpuTimeEnd()); 1351 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1352 PetscFunctionReturn(PETSC_SUCCESS); 1353 } 1354 1355 struct MatCOOStruct_SeqAIJKokkos { 1356 PetscCount n; 1357 PetscCount Atot; 1358 PetscInt nz; 1359 PetscCountKokkosView jmap; 1360 PetscCountKokkosView perm; 1361 1362 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1363 { 1364 nz = coo_h->nz; 1365 n = coo_h->n; 1366 Atot = coo_h->Atot; 1367 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1368 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1369 } 1370 }; 1371 1372 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1373 { 1374 PetscFunctionBegin; 1375 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1376 PetscFunctionReturn(PETSC_SUCCESS); 1377 } 1378 1379 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1380 { 1381 Mat_SeqAIJKokkos *akok; 1382 Mat_SeqAIJ *aseq; 1383 PetscContainer container_h; 1384 MatCOOStruct_SeqAIJ *coo_h; 1385 MatCOOStruct_SeqAIJKokkos *coo_d; 1386 1387 PetscFunctionBegin; 1388 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1389 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1390 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1391 delete akok; 1392 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1393 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1394 1395 // Copy the COO struct to device 1396 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1397 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1398 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1399 1400 // Put the COO struct in a container and then attach that to the matrix 1401 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1402 PetscFunctionReturn(PETSC_SUCCESS); 1403 } 1404 1405 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1406 { 1407 MatScalarKokkosView Aa; 1408 ConstMatScalarKokkosView kv; 1409 PetscMemType memtype; 1410 PetscContainer container; 1411 MatCOOStruct_SeqAIJKokkos *coo; 1412 1413 PetscFunctionBegin; 1414 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1415 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1416 1417 const auto &n = coo->n; 1418 const auto &Annz = coo->nz; 1419 const auto &jmap = coo->jmap; 1420 const auto &perm = coo->perm; 1421 1422 PetscCall(PetscGetMemType(v, &memtype)); 1423 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1424 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1425 } else { 1426 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1427 } 1428 1429 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1430 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1431 1432 PetscCall(PetscLogGpuTimeBegin()); 1433 Kokkos::parallel_for( 1434 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1435 PetscScalar sum = 0.0; 1436 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1437 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1438 }); 1439 PetscCall(PetscLogGpuTimeEnd()); 1440 1441 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1442 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1443 PetscFunctionReturn(PETSC_SUCCESS); 1444 } 1445 1446 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1447 { 1448 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1449 1450 PetscFunctionBegin; 1451 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1452 A->boundtocpu = PETSC_FALSE; 1453 1454 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1455 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1456 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1457 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1458 A->ops->scale = MatScale_SeqAIJKokkos; 1459 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1460 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1461 A->ops->mult = MatMult_SeqAIJKokkos; 1462 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1463 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1464 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1465 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1466 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1467 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1468 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1469 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1470 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1471 A->ops->shift = MatShift_SeqAIJKokkos; 1472 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1473 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1474 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1475 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1476 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1477 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1478 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1479 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1480 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1481 1482 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1483 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1484 #if defined(PETSC_HAVE_HYPRE) 1485 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1486 #endif 1487 PetscFunctionReturn(PETSC_SUCCESS); 1488 } 1489 1490 /* 1491 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1492 1493 Input Parameters: 1494 + A - the MATSEQAIJKOKKOS matrix 1495 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1496 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1497 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1498 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1499 1500 Output Parameter: 1501 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1502 */ 1503 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1504 { 1505 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1506 PetscInt nblocks = bs.extent(0) - 1; 1507 1508 PetscFunctionBegin; 1509 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1510 1511 // Pull out the diagonal blocks of the matrix and then invert the blocks 1512 auto Aa = akok->a_dual.view_device(); 1513 auto Ai = akok->i_dual.view_device(); 1514 auto Aj = akok->j_dual.view_device(); 1515 auto Adiag = akok->diag_dual.view_device(); 1516 // TODO: how to tune the team size? 1517 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1518 auto ts = Kokkos::AUTO(); 1519 #else 1520 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1521 #endif 1522 PetscCallCXX(Kokkos::parallel_for( 1523 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1524 const PetscInt bid = teamMember.league_rank(); // block id 1525 const PetscInt rstart = bs(bid); // this block starts from this row 1526 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1527 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1528 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1529 1530 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1531 PetscInt i = rstart + r; // i-th row in A 1532 1533 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1534 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1535 1536 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1537 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1538 B(r, c) = 0.0; 1539 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1540 B(r, c) = Aa(first + c); 1541 } else { // this entry does not show up in the CSR 1542 B(r, c) = 0.0; 1543 } 1544 } 1545 } else { // rare case that the diagonal does not exist 1546 const PetscInt begin = Ai(i); 1547 const PetscInt end = Ai(i + 1); 1548 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1549 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1550 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1551 else if (Aj(j) >= rstart + m) break; 1552 } 1553 } 1554 }); 1555 1556 // LU-decompose B (w/o pivoting) and then invert B 1557 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1558 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1559 })); 1560 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1561 PetscFunctionReturn(PETSC_SUCCESS); 1562 } 1563 1564 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1565 { 1566 Mat_SeqAIJ *aseq; 1567 PetscInt i, m, n; 1568 auto exec = PetscGetKokkosExecutionSpace(); 1569 1570 PetscFunctionBegin; 1571 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1572 1573 m = akok->nrows(); 1574 n = akok->ncols(); 1575 PetscCall(MatSetSizes(A, m, n, m, n)); 1576 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1577 1578 /* Set up data structures of A as a MATSEQAIJ */ 1579 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1580 aseq = (Mat_SeqAIJ *)A->data; 1581 1582 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1583 PetscCallCXX(akok->j_dual.sync_host(exec)); 1584 PetscCallCXX(exec.fence()); 1585 1586 aseq->i = akok->i_host_data(); 1587 aseq->j = akok->j_host_data(); 1588 aseq->a = akok->a_host_data(); 1589 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1590 aseq->free_a = PETSC_FALSE; 1591 aseq->free_ij = PETSC_FALSE; 1592 aseq->nz = akok->nnz(); 1593 aseq->maxnz = aseq->nz; 1594 1595 PetscCall(PetscMalloc1(m, &aseq->imax)); 1596 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1597 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1598 1599 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1600 akok->nonzerostate = A->nonzerostate; 1601 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1602 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1603 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1604 PetscFunctionReturn(PETSC_SUCCESS); 1605 } 1606 1607 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1608 { 1609 PetscFunctionBegin; 1610 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1611 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1612 PetscFunctionReturn(PETSC_SUCCESS); 1613 } 1614 1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1616 { 1617 Mat_SeqAIJKokkos *akok; 1618 1619 PetscFunctionBegin; 1620 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1621 PetscCall(MatCreate(comm, A)); 1622 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1627 1628 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1629 */ 1630 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1631 { 1632 PetscFunctionBegin; 1633 PetscCall(MatCreate(comm, A)); 1634 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637 1638 /*@C 1639 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1640 (the default parallel PETSc format). This matrix will ultimately be handled by 1641 Kokkos for calculations. 1642 1643 Collective 1644 1645 Input Parameters: 1646 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1647 . m - number of rows 1648 . n - number of columns 1649 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1650 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1651 1652 Output Parameter: 1653 . A - the matrix 1654 1655 Level: intermediate 1656 1657 Notes: 1658 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1659 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1660 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1661 1662 The AIJ format, also called 1663 compressed row storage, is fully compatible with standard Fortran 1664 storage. That is, the stored row and column indices can begin at 1665 either one (as in Fortran) or zero. 1666 1667 Specify the preallocated storage with either `nz` or `nnz` (not both). 1668 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1669 allocation. 1670 1671 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1672 @*/ 1673 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1674 { 1675 PetscFunctionBegin; 1676 PetscCall(PetscKokkosInitializeCheck()); 1677 PetscCall(MatCreate(comm, A)); 1678 PetscCall(MatSetSizes(*A, m, n, m, n)); 1679 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1680 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1685 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1686 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1687 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1688 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1689 { 1690 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1691 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1692 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1693 1694 PetscFunctionBegin; 1695 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1696 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1697 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1698 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1699 } 1700 PetscFunctionReturn(PETSC_SUCCESS); 1701 } 1702 1703 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1704 { 1705 const PetscInt n = A->rmap->n; 1706 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1707 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1708 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1709 1710 PetscFunctionBegin; 1711 if (!factors->transpose_updated) { 1712 if (has_upper) { 1713 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1714 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1715 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1716 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1717 } 1718 1719 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1720 if (!factors->U) { 1721 Mat_SeqAIJ *seq; 1722 1723 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1724 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1725 1726 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1727 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1728 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1729 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1730 } else { 1731 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1732 } 1733 // Copy Ut from host to device 1734 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1735 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1736 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1737 } else { // If U was computed on device, we also compute the transpose there 1738 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1739 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1740 factors->jU_d, factors->aU_d, 1741 factors->iUt_d, factors->jUt_d, 1742 factors->aUt_d)); 1743 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1744 } 1745 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1746 } 1747 1748 // do the same for L with LU 1749 if (has_lower) { 1750 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1751 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1752 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1753 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1754 } 1755 1756 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1757 if (!factors->L) { 1758 Mat_SeqAIJ *seq; 1759 1760 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1761 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1762 1763 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1764 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1765 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1766 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1767 } else { 1768 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1769 } 1770 // Copy Lt from host to device 1771 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1772 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1773 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1774 } else { // If L was computed on device, we also compute the transpose there 1775 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1776 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1777 factors->jL_d, factors->aL_d, 1778 factors->iLt_d, factors->jLt_d, 1779 factors->aLt_d)); 1780 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1781 } 1782 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1783 } 1784 1785 factors->transpose_updated = PETSC_TRUE; 1786 } 1787 PetscFunctionReturn(PETSC_SUCCESS); 1788 } 1789 1790 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1791 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1792 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1793 { 1794 auto exec = PetscGetKokkosExecutionSpace(); 1795 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1796 PetscInt m = A->rmap->n; 1797 PetscScalarKokkosView D = factors->D_d; 1798 PetscScalarKokkosView X, Y, B; // alias 1799 ConstPetscScalarKokkosView b; 1800 PetscScalarKokkosView x; 1801 PetscIntKokkosView &rowperm = factors->rowperm; 1802 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1803 1804 PetscFunctionBegin; 1805 PetscCall(PetscLogGpuTimeBegin()); 1806 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1807 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1808 PetscCall(VecGetKokkosView(bb, &b)); 1809 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1810 1811 // Solve U^T Y = B 1812 if (identity) { // Reorder b with the row permutation 1813 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1814 Y = factors->workVector; 1815 } else { 1816 B = factors->workVector; 1817 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1818 Y = x; 1819 } 1820 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1821 1822 // Solve diag(D) Y' = Y. 1823 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1824 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1825 1826 // Solve UX = Y 1827 if (identity) { 1828 X = x; 1829 } else { 1830 X = factors->workVector; // B is not needed anymore 1831 } 1832 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1833 1834 // Reorder X with the inverse column (row) permutation 1835 if (!identity) { 1836 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1837 } 1838 1839 PetscCall(VecRestoreKokkosView(bb, &b)); 1840 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1841 PetscCall(PetscLogGpuTimeEnd()); 1842 PetscFunctionReturn(PETSC_SUCCESS); 1843 } 1844 1845 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1846 // R and C are represented by rowperm and colperm in factors. 1847 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1848 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1849 { 1850 auto exec = PetscGetKokkosExecutionSpace(); 1851 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1852 PetscInt m = A->rmap->n; 1853 PetscScalarKokkosView X, Y, B; // alias 1854 ConstPetscScalarKokkosView b; 1855 PetscScalarKokkosView x; 1856 PetscIntKokkosView &rowperm = factors->rowperm; 1857 PetscIntKokkosView &colperm = factors->colperm; 1858 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1859 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1860 1861 PetscFunctionBegin; 1862 PetscCall(PetscLogGpuTimeBegin()); 1863 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1864 PetscCall(VecGetKokkosView(bb, &b)); 1865 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1866 1867 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1868 if (row_identity) { 1869 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1870 Y = factors->workVector; 1871 } else { 1872 B = factors->workVector; 1873 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1874 Y = x; 1875 } 1876 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1877 1878 // Solve U C^- x = Y 1879 if (col_identity) { 1880 X = x; 1881 } else { 1882 X = factors->workVector; 1883 } 1884 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1885 1886 // x = C X; Reorder X with the inverse col permutation 1887 if (!col_identity) { 1888 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1889 } 1890 1891 PetscCall(VecRestoreKokkosView(bb, &b)); 1892 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1893 PetscCall(PetscLogGpuTimeEnd()); 1894 PetscFunctionReturn(PETSC_SUCCESS); 1895 } 1896 1897 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1898 // R and C are represented by rowperm and colperm in factors. 1899 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1900 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1901 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1902 { 1903 auto exec = PetscGetKokkosExecutionSpace(); 1904 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1905 PetscInt m = A->rmap->n; 1906 PetscScalarKokkosView X, Y, B; // alias 1907 ConstPetscScalarKokkosView b; 1908 PetscScalarKokkosView x; 1909 PetscIntKokkosView &rowperm = factors->rowperm; 1910 PetscIntKokkosView &colperm = factors->colperm; 1911 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1912 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1913 1914 PetscFunctionBegin; 1915 PetscCall(PetscLogGpuTimeBegin()); 1916 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1917 PetscCall(VecGetKokkosView(bb, &b)); 1918 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1919 1920 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1921 if (col_identity) { // Reorder b with the col permutation 1922 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1923 Y = factors->workVector; 1924 } else { 1925 B = factors->workVector; 1926 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1927 Y = x; 1928 } 1929 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1930 1931 // Solve L^T X = Y 1932 if (row_identity) { 1933 X = x; 1934 } else { 1935 X = factors->workVector; 1936 } 1937 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1938 1939 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1940 if (!row_identity) { 1941 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1942 } 1943 1944 PetscCall(VecRestoreKokkosView(bb, &b)); 1945 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1946 PetscCall(PetscLogGpuTimeEnd()); 1947 PetscFunctionReturn(PETSC_SUCCESS); 1948 } 1949 1950 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1951 { 1952 PetscFunctionBegin; 1953 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1954 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1955 1956 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1957 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1958 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1959 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1960 const MatScalar *Ba = b->a; 1961 PetscInt m = B->rmap->n, n = B->cmap->n; 1962 1963 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1964 // Allocate memory and copy the structure 1965 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1966 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1967 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1968 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1969 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1970 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1971 1972 PetscInt *Li = factors->iL_h.data(); 1973 PetscInt *Lj = factors->jL_h.data(); 1974 PetscInt *Ui = factors->iU_h.data(); 1975 PetscInt *Uj = factors->jU_h.data(); 1976 1977 Li[0] = Ui[0] = 0; 1978 for (PetscInt i = 0; i < m; i++) { 1979 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1980 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1981 1982 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1983 Lj[Li[i] + llen] = i; // diagonal entry of L 1984 1985 Uj[Ui[i]] = i; // diagonal entry of U 1986 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1987 1988 Li[i + 1] = Li[i] + llen + 1; 1989 Ui[i + 1] = Ui[i] + ulen; 1990 } 1991 1992 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1993 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1994 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1995 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1996 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1997 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1998 1999 // Copy row/col permutation to device 2000 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2001 PetscBool row_identity; 2002 PetscCall(ISIdentity(rowperm, &row_identity)); 2003 if (!row_identity) { 2004 const PetscInt *ip; 2005 2006 PetscCall(ISGetIndices(rowperm, &ip)); 2007 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 2008 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2009 PetscCall(ISRestoreIndices(rowperm, &ip)); 2010 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2011 } 2012 2013 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2014 PetscBool col_identity; 2015 PetscCall(ISIdentity(colperm, &col_identity)); 2016 if (!col_identity) { 2017 const PetscInt *ip; 2018 2019 PetscCall(ISGetIndices(colperm, &ip)); 2020 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2021 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2022 PetscCall(ISRestoreIndices(colperm, &ip)); 2023 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2024 } 2025 2026 /* Create sptrsv handles for L, U and their transpose */ 2027 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2028 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2029 #else 2030 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2031 #endif 2032 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2033 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2034 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2035 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2036 } 2037 2038 // Copy the value 2039 for (PetscInt i = 0; i < m; i++) { 2040 PetscInt llen = Bi[i + 1] - Bi[i]; 2041 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2042 const PetscInt *Li = factors->iL_h.data(); 2043 const PetscInt *Ui = factors->iU_h.data(); 2044 2045 PetscScalar *La = factors->aL_h.data(); 2046 PetscScalar *Ua = factors->aU_h.data(); 2047 2048 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2049 La[Li[i] + llen] = 1.0; // diagonal entry 2050 2051 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2052 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2053 } 2054 2055 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2056 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2057 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2058 factors->transpose_updated = PETSC_FALSE; 2059 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2060 2061 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2062 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2063 } 2064 2065 B->ops->matsolve = NULL; 2066 B->ops->matsolvetranspose = NULL; 2067 PetscFunctionReturn(PETSC_SUCCESS); 2068 } 2069 2070 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2071 { 2072 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2073 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2074 PetscInt fill_lev = info->levels; 2075 2076 PetscFunctionBegin; 2077 PetscCall(PetscLogGpuTimeBegin()); 2078 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2079 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2080 2081 auto a_d = aijkok->a_dual.view_device(); 2082 auto i_d = aijkok->i_dual.view_device(); 2083 auto j_d = aijkok->j_dual.view_device(); 2084 2085 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2086 2087 B->assembled = PETSC_TRUE; 2088 B->preallocated = PETSC_TRUE; 2089 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2090 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2091 B->ops->matsolve = NULL; 2092 B->ops->matsolvetranspose = NULL; 2093 2094 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2095 factors->transpose_updated = PETSC_FALSE; 2096 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2097 /* TODO: log flops, but how to know that? */ 2098 PetscCall(PetscLogGpuTimeEnd()); 2099 PetscFunctionReturn(PETSC_SUCCESS); 2100 } 2101 2102 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2103 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2104 { 2105 Mat_SeqAIJKokkos *aijkok; 2106 Mat_SeqAIJ *b; 2107 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2108 PetscInt fill_lev = info->levels; 2109 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2110 PetscInt n = A->rmap->n; 2111 2112 PetscFunctionBegin; 2113 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2114 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2115 2116 /* Create a spiluk handle and then do symbolic factorization */ 2117 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2118 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2119 2120 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2121 2122 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2123 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2124 Kokkos::realloc(factors->iU_d, n + 1); 2125 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2126 2127 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2128 auto i_d = aijkok->i_dual.view_device(); 2129 auto j_d = aijkok->j_dual.view_device(); 2130 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2131 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2132 2133 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2134 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2135 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2136 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2137 2138 /* TODO: add options to select sptrsv algorithms */ 2139 /* Create sptrsv handles for L, U and their transpose */ 2140 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2141 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2142 #else 2143 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2144 #endif 2145 2146 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2147 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2148 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2149 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2150 2151 /* Fill fields of the factor matrix B */ 2152 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2153 b = (Mat_SeqAIJ *)B->data; 2154 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2155 B->info.fill_ratio_given = info->fill; 2156 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2157 2158 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2159 PetscFunctionReturn(PETSC_SUCCESS); 2160 } 2161 2162 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2163 { 2164 PetscFunctionBegin; 2165 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2166 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2167 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2168 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2169 PetscFunctionReturn(PETSC_SUCCESS); 2170 } 2171 2172 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2173 { 2174 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2175 2176 PetscFunctionBegin; 2177 if (!info->factoronhost) { 2178 PetscCall(ISIdentity(isrow, &row_identity)); 2179 PetscCall(ISIdentity(iscol, &col_identity)); 2180 } 2181 2182 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2183 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2184 2185 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2186 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2187 } else { 2188 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2189 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2190 } 2191 PetscFunctionReturn(PETSC_SUCCESS); 2192 } 2193 2194 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2195 { 2196 PetscFunctionBegin; 2197 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2198 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2199 2200 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2201 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2202 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2203 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2204 const MatScalar *Ba = b->a; 2205 PetscInt m = B->rmap->n; 2206 2207 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2208 // Allocate memory and copy the structure 2209 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2210 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2211 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2212 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2213 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2214 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2215 2216 // Build jU_h from the skewed Aj 2217 PetscInt *Uj = factors->jU_h.data(); 2218 for (PetscInt i = 0; i < m; i++) { 2219 PetscInt ulen = Bi[i + 1] - Bi[i]; 2220 Uj[Bi[i]] = i; // diagonal entry 2221 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2222 } 2223 2224 // Copy iU, jU to device 2225 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2226 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2227 2228 // Copy row/col permutation to device 2229 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2230 PetscBool row_identity; 2231 PetscCall(ISIdentity(rowperm, &row_identity)); 2232 if (!row_identity) { 2233 const PetscInt *ip; 2234 2235 PetscCall(ISGetIndices(rowperm, &ip)); 2236 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2237 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2238 PetscCall(ISRestoreIndices(rowperm, &ip)); 2239 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2240 } 2241 2242 // Create sptrsv handles for U and U^T 2243 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2244 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2245 #else 2246 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2247 #endif 2248 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2249 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2250 } 2251 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2252 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2253 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2254 2255 // Copy the value 2256 PetscScalar *Ua = factors->aU_h.data(); 2257 PetscScalar *D = factors->D_h.data(); 2258 for (PetscInt i = 0; i < m; i++) { 2259 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2260 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2261 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2262 } 2263 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2264 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2265 2266 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2267 factors->transpose_updated = PETSC_FALSE; 2268 } 2269 2270 B->ops->matsolve = NULL; 2271 B->ops->matsolvetranspose = NULL; 2272 PetscFunctionReturn(PETSC_SUCCESS); 2273 } 2274 2275 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2276 { 2277 PetscFunctionBegin; 2278 if (info->solveonhost) { 2279 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2280 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2281 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2282 } 2283 2284 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2285 2286 if (!info->solveonhost) { 2287 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2288 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2289 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2290 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2291 } 2292 PetscFunctionReturn(PETSC_SUCCESS); 2293 } 2294 2295 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2296 { 2297 PetscFunctionBegin; 2298 if (info->solveonhost) { 2299 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2300 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2301 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2302 } 2303 2304 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2305 2306 if (!info->solveonhost) { 2307 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2308 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2309 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2310 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2311 } 2312 PetscFunctionReturn(PETSC_SUCCESS); 2313 } 2314 2315 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2316 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2317 { 2318 PetscFunctionBegin; 2319 *type = MATSOLVERKOKKOS; 2320 PetscFunctionReturn(PETSC_SUCCESS); 2321 } 2322 2323 /*MC 2324 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2325 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2326 2327 Level: beginner 2328 2329 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2330 M*/ 2331 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2332 { 2333 PetscInt n = A->rmap->n; 2334 MPI_Comm comm; 2335 2336 PetscFunctionBegin; 2337 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2338 PetscCall(MatCreate(comm, B)); 2339 PetscCall(MatSetSizes(*B, n, n, n, n)); 2340 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2341 (*B)->factortype = ftype; 2342 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2343 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2344 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2345 2346 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2347 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2348 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2349 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2350 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2351 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2352 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2353 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2354 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2355 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2356 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2357 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2358 2359 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2360 (*B)->canuseordering = PETSC_TRUE; 2361 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2362 PetscFunctionReturn(PETSC_SUCCESS); 2363 } 2364 2365 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2366 { 2367 PetscFunctionBegin; 2368 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2369 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2370 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2371 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2372 PetscFunctionReturn(PETSC_SUCCESS); 2373 } 2374 2375 /* Utility to print out a KokkosCsrMatrix for debugging */ 2376 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2377 { 2378 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2379 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2380 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2381 const PetscInt *i = iv.data(); 2382 const PetscInt *j = jv.data(); 2383 const PetscScalar *a = av.data(); 2384 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2385 2386 PetscFunctionBegin; 2387 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2388 for (PetscInt k = 0; k < m; k++) { 2389 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2390 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2391 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2392 } 2393 PetscFunctionReturn(PETSC_SUCCESS); 2394 } 2395