1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43 using KokkosSparse::spiluk_symbolic; 44 using KokkosSparse::spiluk_numeric; 45 using KokkosSparse::sptrsv_symbolic; 46 using KokkosSparse::sptrsv_solve; 47 using KokkosSparse::Experimental::SPTRSVAlgorithm; 48 using KokkosSparse::Experimental::SPILUKAlgorithm; 49 #else 50 using KokkosSparse::Experimental::spiluk_symbolic; 51 using KokkosSparse::Experimental::spiluk_numeric; 52 using KokkosSparse::Experimental::sptrsv_symbolic; 53 using KokkosSparse::Experimental::sptrsv_solve; 54 using KokkosSparse::Experimental::SPTRSVAlgorithm; 55 using KokkosSparse::Experimental::SPILUKAlgorithm; 56 #endif 57 58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 59 60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62 In the latter case, it is important to set a_dual's sync state correctly. 63 */ 64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65 { 66 Mat_SeqAIJ *aijseq; 67 Mat_SeqAIJKokkos *aijkok; 68 69 PetscFunctionBegin; 70 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 71 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72 73 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75 76 /* If aijkok does not exist, we just copy i, j to device. 77 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78 In both cases, we build a new aijkok structure. 79 */ 80 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 81 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */ 82 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88 aijseq->free_a = PETSC_TRUE; 89 aijseq->free_ij = PETSC_TRUE; 90 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91 we have the CsrMatrix on device first and then copy to host, followed by 92 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93 One could improve it by not using NULL garray. 94 */ 95 } 96 delete aijkok; 97 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98 A->spptr = aijkok; 99 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103 } 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 /* Sync CSR data to device if not yet */ 108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 114 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115 if (aijkok->a_dual.need_sync_device()) { 116 aijkok->a_dual.sync_device(); 117 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 118 aijkok->hermitian_updated = PETSC_FALSE; 119 } 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /* Mark the CSR data on device as modified */ 124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 130 aijkok->a_dual.clear_sync_state(); 131 aijkok->a_dual.modify_device(); 132 aijkok->transpose_updated = PETSC_FALSE; 133 aijkok->hermitian_updated = PETSC_FALSE; 134 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 135 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140 { 141 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 142 auto exec = PetscGetKokkosExecutionSpace(); 143 144 PetscFunctionBegin; 145 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 146 /* We do not expect one needs factors on host */ 147 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 148 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149 PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec)); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 159 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 160 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 161 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 162 */ 163 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164 auto exec = PetscGetKokkosExecutionSpace(); 165 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 166 PetscCallCXX(exec.fence()); 167 *array = aijkok->a_dual.view_host().data(); 168 } else { /* Happens when calling MatSetValues on a newly created matrix */ 169 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 170 } 171 PetscFunctionReturn(PETSC_SUCCESS); 172 } 173 174 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 175 { 176 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 177 178 PetscFunctionBegin; 179 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 180 PetscFunctionReturn(PETSC_SUCCESS); 181 } 182 183 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 184 { 185 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 186 187 PetscFunctionBegin; 188 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 189 auto exec = PetscGetKokkosExecutionSpace(); 190 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 191 PetscCallCXX(exec.fence()); 192 *array = aijkok->a_dual.view_host().data(); 193 } else { 194 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 195 } 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 200 { 201 PetscFunctionBegin; 202 *array = NULL; 203 PetscFunctionReturn(PETSC_SUCCESS); 204 } 205 206 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 207 { 208 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 209 210 PetscFunctionBegin; 211 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 212 *array = aijkok->a_dual.view_host().data(); 213 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 214 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 215 } 216 PetscFunctionReturn(PETSC_SUCCESS); 217 } 218 219 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 222 223 PetscFunctionBegin; 224 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 225 aijkok->a_dual.clear_sync_state(); 226 aijkok->a_dual.modify_host(); 227 } 228 PetscFunctionReturn(PETSC_SUCCESS); 229 } 230 231 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 232 { 233 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 234 235 PetscFunctionBegin; 236 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 237 238 if (i) *i = aijkok->i_device_data(); 239 if (j) *j = aijkok->j_device_data(); 240 if (a) { 241 aijkok->a_dual.sync_device(); 242 *a = aijkok->a_device_data(); 243 } 244 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 245 PetscFunctionReturn(PETSC_SUCCESS); 246 } 247 248 /* 249 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 250 251 Input Parameter: 252 . A - the MATSEQAIJKOKKOS matrix 253 254 Output Parameters: 255 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 256 - T_d - the transpose on device, whose value array is allocated but not initialized 257 */ 258 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 259 { 260 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 261 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 262 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 263 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 264 MatRowMapType *Ti = Ti_h.data(); 265 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 266 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 267 PetscInt *Tj = Tj_h.data(); 268 PetscInt *perm = perm_h.data(); 269 PetscInt *offset; 270 271 PetscFunctionBegin; 272 // Populate Ti 273 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 274 Ti++; 275 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 276 Ti--; 277 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 278 279 // Populate Tj and the permutation array 280 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 281 for (PetscInt i = 0; i < m; i++) { 282 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 283 PetscInt r = Aj[j]; // row r of T 284 PetscInt disp = Ti[r] + offset[r]; 285 286 Tj[disp] = i; // col i of T 287 perm[disp] = j; 288 offset[r]++; 289 } 290 } 291 PetscCall(PetscFree(offset)); 292 293 // Sort each row of T, along with the permutation array 294 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 295 296 // Output perm and T on device 297 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 298 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 299 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 300 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 301 PetscFunctionReturn(PETSC_SUCCESS); 302 } 303 304 // Generate the transpose on device and cache it internally 305 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 306 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 307 { 308 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 309 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 311 KokkosCsrMatrix &T = akok->csrmatT; 312 313 PetscFunctionBegin; 314 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 315 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 316 317 const auto &Aa = akok->a_dual.view_device(); 318 319 if (A->symmetric == PETSC_BOOL3_TRUE) { 320 *csrmatT = akok->csrmat; 321 } else { 322 // See if we already have a cached transpose and its value is up to date 323 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 324 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 325 const auto &perm = akok->transpose_perm; // get the permutation array 326 auto &Ta = T.values; 327 328 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 329 } 330 } else { // Generate T of size n x m for the first time 331 MatRowMapKokkosView perm; 332 333 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 334 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 335 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 336 } 337 akok->transpose_updated = PETSC_TRUE; 338 *csrmatT = akok->csrmatT; 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 // Generate the Hermitian on device and cache it internally 344 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 345 { 346 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 347 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 348 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 349 KokkosCsrMatrix &T = akok->csrmatH; 350 351 PetscFunctionBegin; 352 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 353 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 354 355 const auto &Aa = akok->a_dual.view_device(); 356 357 if (A->hermitian == PETSC_BOOL3_TRUE) { 358 *csrmatH = akok->csrmat; 359 } else { 360 // See if we already have a cached hermitian and its value is up to date 361 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 362 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 363 const auto &perm = akok->transpose_perm; // get the permutation array 364 auto &Ta = T.values; 365 366 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 367 } 368 } else { // Generate T of size n x m for the first time 369 MatRowMapKokkosView perm; 370 371 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 372 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 373 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 374 } 375 akok->hermitian_updated = PETSC_TRUE; 376 *csrmatH = akok->csrmatH; 377 } 378 PetscFunctionReturn(PETSC_SUCCESS); 379 } 380 381 /* y = A x */ 382 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 383 { 384 Mat_SeqAIJKokkos *aijkok; 385 ConstPetscScalarKokkosView xv; 386 PetscScalarKokkosView yv; 387 388 PetscFunctionBegin; 389 PetscCall(PetscLogGpuTimeBegin()); 390 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 391 PetscCall(VecGetKokkosView(xx, &xv)); 392 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 393 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 394 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 395 PetscCall(VecRestoreKokkosView(xx, &xv)); 396 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 397 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 398 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 399 PetscCall(PetscLogGpuTimeEnd()); 400 PetscFunctionReturn(PETSC_SUCCESS); 401 } 402 403 /* y = A^T x */ 404 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 405 { 406 Mat_SeqAIJKokkos *aijkok; 407 const char *mode; 408 ConstPetscScalarKokkosView xv; 409 PetscScalarKokkosView yv; 410 KokkosCsrMatrix csrmat; 411 412 PetscFunctionBegin; 413 PetscCall(PetscLogGpuTimeBegin()); 414 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 415 PetscCall(VecGetKokkosView(xx, &xv)); 416 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 417 if (A->form_explicit_transpose) { 418 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 419 mode = "N"; 420 } else { 421 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 422 csrmat = aijkok->csrmat; 423 mode = "T"; 424 } 425 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 426 PetscCall(VecRestoreKokkosView(xx, &xv)); 427 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 428 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 429 PetscCall(PetscLogGpuTimeEnd()); 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 433 /* y = A^H x */ 434 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 435 { 436 Mat_SeqAIJKokkos *aijkok; 437 const char *mode; 438 ConstPetscScalarKokkosView xv; 439 PetscScalarKokkosView yv; 440 KokkosCsrMatrix csrmat; 441 442 PetscFunctionBegin; 443 PetscCall(PetscLogGpuTimeBegin()); 444 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 445 PetscCall(VecGetKokkosView(xx, &xv)); 446 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 447 if (A->form_explicit_transpose) { 448 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 449 mode = "N"; 450 } else { 451 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 452 csrmat = aijkok->csrmat; 453 mode = "C"; 454 } 455 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 456 PetscCall(VecRestoreKokkosView(xx, &xv)); 457 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 458 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 459 PetscCall(PetscLogGpuTimeEnd()); 460 PetscFunctionReturn(PETSC_SUCCESS); 461 } 462 463 /* z = A x + y */ 464 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 465 { 466 Mat_SeqAIJKokkos *aijkok; 467 ConstPetscScalarKokkosView xv; 468 PetscScalarKokkosView zv; 469 470 PetscFunctionBegin; 471 PetscCall(PetscLogGpuTimeBegin()); 472 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 473 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 474 PetscCall(VecGetKokkosView(xx, &xv)); 475 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 476 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 477 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 478 PetscCall(VecRestoreKokkosView(xx, &xv)); 479 PetscCall(VecRestoreKokkosView(zz, &zv)); 480 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 481 PetscCall(PetscLogGpuTimeEnd()); 482 PetscFunctionReturn(PETSC_SUCCESS); 483 } 484 485 /* z = A^T x + y */ 486 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 487 { 488 Mat_SeqAIJKokkos *aijkok; 489 const char *mode; 490 ConstPetscScalarKokkosView xv; 491 PetscScalarKokkosView zv; 492 KokkosCsrMatrix csrmat; 493 494 PetscFunctionBegin; 495 PetscCall(PetscLogGpuTimeBegin()); 496 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 497 if (zz != yy) PetscCall(VecCopy(yy, zz)); 498 PetscCall(VecGetKokkosView(xx, &xv)); 499 PetscCall(VecGetKokkosView(zz, &zv)); 500 if (A->form_explicit_transpose) { 501 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 502 mode = "N"; 503 } else { 504 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 505 csrmat = aijkok->csrmat; 506 mode = "T"; 507 } 508 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 509 PetscCall(VecRestoreKokkosView(xx, &xv)); 510 PetscCall(VecRestoreKokkosView(zz, &zv)); 511 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 512 PetscCall(PetscLogGpuTimeEnd()); 513 PetscFunctionReturn(PETSC_SUCCESS); 514 } 515 516 /* z = A^H x + y */ 517 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 518 { 519 Mat_SeqAIJKokkos *aijkok; 520 const char *mode; 521 ConstPetscScalarKokkosView xv; 522 PetscScalarKokkosView zv; 523 KokkosCsrMatrix csrmat; 524 525 PetscFunctionBegin; 526 PetscCall(PetscLogGpuTimeBegin()); 527 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 528 if (zz != yy) PetscCall(VecCopy(yy, zz)); 529 PetscCall(VecGetKokkosView(xx, &xv)); 530 PetscCall(VecGetKokkosView(zz, &zv)); 531 if (A->form_explicit_transpose) { 532 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 533 mode = "N"; 534 } else { 535 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 536 csrmat = aijkok->csrmat; 537 mode = "C"; 538 } 539 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 540 PetscCall(VecRestoreKokkosView(xx, &xv)); 541 PetscCall(VecRestoreKokkosView(zz, &zv)); 542 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 543 PetscCall(PetscLogGpuTimeEnd()); 544 PetscFunctionReturn(PETSC_SUCCESS); 545 } 546 547 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 548 { 549 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 550 551 PetscFunctionBegin; 552 switch (op) { 553 case MAT_FORM_EXPLICIT_TRANSPOSE: 554 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 555 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 556 A->form_explicit_transpose = flg; 557 break; 558 default: 559 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 560 break; 561 } 562 PetscFunctionReturn(PETSC_SUCCESS); 563 } 564 565 /* Depending on reuse, either build a new mat, or use the existing mat */ 566 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 567 { 568 Mat_SeqAIJ *aseq; 569 570 PetscFunctionBegin; 571 PetscCall(PetscKokkosInitializeCheck()); 572 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 573 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 574 PetscCall(MatSetType(*newmat, mtype)); 575 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 576 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 577 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 578 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 579 PetscCall(PetscFree(A->defaultvectype)); 580 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 581 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 582 PetscCall(MatSetOps_SeqAIJKokkos(A)); 583 aseq = static_cast<Mat_SeqAIJ *>(A->data); 584 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 585 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 586 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 587 } 588 } 589 PetscFunctionReturn(PETSC_SUCCESS); 590 } 591 592 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 593 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 594 */ 595 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 596 { 597 Mat_SeqAIJ *bseq; 598 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 599 Mat mat; 600 601 PetscFunctionBegin; 602 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 603 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 604 mat = *B; 605 if (A->assembled) { 606 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 607 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 608 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 609 /* Now copy values to B if needed */ 610 if (dupOption == MAT_COPY_VALUES) { 611 if (akok->a_dual.need_sync_device()) { 612 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 613 bkok->a_dual.modify_host(); 614 } else { /* If device has the latest data, we only copy data on device */ 615 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 616 bkok->a_dual.modify_device(); 617 } 618 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 619 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 620 bkok->a_dual.modify_host(); 621 } 622 mat->spptr = bkok; 623 } 624 625 PetscCall(PetscFree(mat->defaultvectype)); 626 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 627 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 628 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 629 PetscFunctionReturn(PETSC_SUCCESS); 630 } 631 632 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 633 { 634 Mat At; 635 KokkosCsrMatrix internT; 636 Mat_SeqAIJKokkos *atkok, *bkok; 637 638 PetscFunctionBegin; 639 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 640 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 641 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 642 /* Deep copy internT, as we want to isolate the internal transpose */ 643 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 644 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 645 if (reuse == MAT_INITIAL_MATRIX) *B = At; 646 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 647 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 648 if ((*B)->assembled) { 649 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 650 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 651 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 652 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 653 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 654 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 655 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 656 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 657 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 658 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 659 } 660 PetscFunctionReturn(PETSC_SUCCESS); 661 } 662 663 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 664 { 665 Mat_SeqAIJKokkos *aijkok; 666 667 PetscFunctionBegin; 668 if (A->factortype == MAT_FACTOR_NONE) { 669 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 670 delete aijkok; 671 } else { 672 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 673 } 674 A->spptr = NULL; 675 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 676 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 677 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 678 #if defined(PETSC_HAVE_HYPRE) 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 680 #endif 681 PetscCall(MatDestroy_SeqAIJ(A)); 682 PetscFunctionReturn(PETSC_SUCCESS); 683 } 684 685 /*MC 686 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 687 688 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 689 690 Options Database Key: 691 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 692 693 Level: beginner 694 695 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 696 M*/ 697 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 698 { 699 PetscFunctionBegin; 700 PetscCall(PetscKokkosInitializeCheck()); 701 PetscCall(MatCreate_SeqAIJ(A)); 702 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 703 PetscFunctionReturn(PETSC_SUCCESS); 704 } 705 706 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 707 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 708 { 709 Mat_SeqAIJ *a, *b; 710 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 711 MatScalarKokkosView aa, ba, ca; 712 MatRowMapKokkosView ai, bi, ci; 713 MatColIdxKokkosView aj, bj, cj; 714 PetscInt m, n, nnz, aN; 715 716 PetscFunctionBegin; 717 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 718 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 719 PetscAssertPointer(C, 4); 720 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 721 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 722 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 723 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 724 725 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 726 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 727 a = static_cast<Mat_SeqAIJ *>(A->data); 728 b = static_cast<Mat_SeqAIJ *>(B->data); 729 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 730 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 731 aa = akok->a_dual.view_device(); 732 ai = akok->i_dual.view_device(); 733 ba = bkok->a_dual.view_device(); 734 bi = bkok->i_dual.view_device(); 735 m = A->rmap->n; /* M, N and nnz of C */ 736 n = A->cmap->n + B->cmap->n; 737 nnz = a->nz + b->nz; 738 aN = A->cmap->n; /* N of A */ 739 if (reuse == MAT_INITIAL_MATRIX) { 740 aj = akok->j_dual.view_device(); 741 bj = bkok->j_dual.view_device(); 742 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 743 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 744 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 745 ca = ca_dual.view_device(); 746 ci = ci_dual.view_device(); 747 cj = cj_dual.view_device(); 748 749 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 750 Kokkos::parallel_for( 751 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752 PetscInt i = t.league_rank(); /* row i */ 753 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754 755 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 756 ci(i) = coffset; 757 if (i == m - 1) ci(m) = ai(m) + bi(m); 758 }); 759 760 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 761 if (k < alen) { 762 ca(coffset + k) = aa(ai(i) + k); 763 cj(coffset + k) = aj(ai(i) + k); 764 } else { 765 ca(coffset + k) = ba(bi(i) + k - alen); 766 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 767 } 768 }); 769 }); 770 ca_dual.modify_device(); 771 ci_dual.modify_device(); 772 cj_dual.modify_device(); 773 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 774 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 775 } else if (reuse == MAT_REUSE_MATRIX) { 776 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 777 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 778 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 779 ca = ckok->a_dual.view_device(); 780 ci = ckok->i_dual.view_device(); 781 782 Kokkos::parallel_for( 783 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 784 PetscInt i = t.league_rank(); /* row i */ 785 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 786 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 787 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 788 else ca(ci(i) + k) = ba(bi(i) + k - alen); 789 }); 790 }); 791 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 792 } 793 PetscFunctionReturn(PETSC_SUCCESS); 794 } 795 796 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 797 { 798 PetscFunctionBegin; 799 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 800 PetscFunctionReturn(PETSC_SUCCESS); 801 } 802 803 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 804 { 805 Mat_Product *product = C->product; 806 Mat A, B; 807 bool transA, transB; /* use bool, since KK needs this type */ 808 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 809 Mat_SeqAIJ *c; 810 MatProductData_SeqAIJKokkos *pdata; 811 KokkosCsrMatrix csrmatA, csrmatB; 812 813 PetscFunctionBegin; 814 MatCheckProduct(C, 1); 815 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 816 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 817 818 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 819 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 820 // we still do numeric. 821 if (pdata->reusesym) { // numeric reuses results from symbolic 822 pdata->reusesym = PETSC_FALSE; 823 PetscFunctionReturn(PETSC_SUCCESS); 824 } 825 826 switch (product->type) { 827 case MATPRODUCT_AB: 828 transA = false; 829 transB = false; 830 break; 831 case MATPRODUCT_AtB: 832 transA = true; 833 transB = false; 834 break; 835 case MATPRODUCT_ABt: 836 transA = false; 837 transB = true; 838 break; 839 default: 840 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 841 } 842 843 A = product->A; 844 B = product->B; 845 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 846 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 847 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 848 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 849 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 850 851 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 852 853 csrmatA = akok->csrmat; 854 csrmatB = bkok->csrmat; 855 856 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 857 if (transA) { 858 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 859 transA = false; 860 } 861 862 if (transB) { 863 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 864 transB = false; 865 } 866 PetscCall(PetscLogGpuTimeBegin()); 867 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 868 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 869 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 870 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 871 #endif 872 873 PetscCall(PetscLogGpuTimeEnd()); 874 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 875 /* shorter version of MatAssemblyEnd_SeqAIJ */ 876 c = (Mat_SeqAIJ *)C->data; 877 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 878 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 879 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 880 c->reallocs = 0; 881 C->info.mallocs = 0; 882 C->info.nz_unneeded = 0; 883 C->assembled = C->was_assembled = PETSC_TRUE; 884 C->num_ass++; 885 PetscFunctionReturn(PETSC_SUCCESS); 886 } 887 888 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 889 { 890 Mat_Product *product = C->product; 891 MatProductType ptype; 892 Mat A, B; 893 bool transA, transB; 894 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 895 MatProductData_SeqAIJKokkos *pdata; 896 MPI_Comm comm; 897 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 898 899 PetscFunctionBegin; 900 MatCheckProduct(C, 1); 901 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 902 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 903 A = product->A; 904 B = product->B; 905 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 906 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 907 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 908 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 909 csrmatA = akok->csrmat; 910 csrmatB = bkok->csrmat; 911 912 ptype = product->type; 913 // Take advantage of the symmetry if true 914 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 915 ptype = MATPRODUCT_AB; 916 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 917 } 918 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 919 ptype = MATPRODUCT_AB; 920 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 921 } 922 923 switch (ptype) { 924 case MATPRODUCT_AB: 925 transA = false; 926 transB = false; 927 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 928 break; 929 case MATPRODUCT_AtB: 930 transA = true; 931 transB = false; 932 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 933 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 934 break; 935 case MATPRODUCT_ABt: 936 transA = false; 937 transB = true; 938 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 939 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 940 break; 941 default: 942 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 943 } 944 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 945 pdata->reusesym = product->api_user; 946 947 /* TODO: add command line options to select spgemm algorithms */ 948 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 949 950 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 951 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 952 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 953 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 954 #endif 955 #endif 956 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 957 958 PetscCall(PetscLogGpuTimeBegin()); 959 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 960 if (transA) { 961 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 962 transA = false; 963 } 964 965 if (transB) { 966 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 967 transB = false; 968 } 969 970 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 971 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 972 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 973 calling new Mat_SeqAIJKokkos(). 974 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 975 */ 976 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 977 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 978 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 979 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 980 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 981 #endif 982 PetscCall(PetscLogGpuTimeEnd()); 983 984 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 985 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 986 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 /* handles sparse matrix matrix ops */ 991 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 992 { 993 Mat_Product *product = mat->product; 994 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 995 996 PetscFunctionBegin; 997 MatCheckProduct(mat, 1); 998 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 999 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 1000 if (Biskok && Ciskok) { 1001 switch (product->type) { 1002 case MATPRODUCT_AB: 1003 case MATPRODUCT_AtB: 1004 case MATPRODUCT_ABt: 1005 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1006 break; 1007 case MATPRODUCT_PtAP: 1008 case MATPRODUCT_RARt: 1009 case MATPRODUCT_ABC: 1010 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1011 break; 1012 default: 1013 break; 1014 } 1015 } else { /* fallback for AIJ */ 1016 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1017 } 1018 PetscFunctionReturn(PETSC_SUCCESS); 1019 } 1020 1021 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1022 { 1023 Mat_SeqAIJKokkos *aijkok; 1024 1025 PetscFunctionBegin; 1026 PetscCall(PetscLogGpuTimeBegin()); 1027 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1028 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1029 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1030 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1031 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1032 PetscCall(PetscLogGpuTimeEnd()); 1033 PetscFunctionReturn(PETSC_SUCCESS); 1034 } 1035 1036 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1037 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1038 { 1039 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1040 1041 PetscFunctionBegin; 1042 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1043 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1044 1045 PetscCall(PetscLogGpuTimeBegin()); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1047 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1048 const auto &Aa = aijkok->a_dual.view_device(); 1049 const auto &Adiag = aijkok->diag_dual.view_device(); 1050 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1051 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1052 PetscCall(PetscLogGpuFlops(n)); 1053 PetscCall(PetscLogGpuTimeEnd()); 1054 } else { // need reassembly, very slow! 1055 PetscCall(MatShift_Basic(A, a)); 1056 } 1057 PetscFunctionReturn(PETSC_SUCCESS); 1058 } 1059 1060 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1061 { 1062 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1063 1064 PetscFunctionBegin; 1065 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1066 ConstPetscScalarKokkosView dv; 1067 PetscInt n, nv; 1068 1069 PetscCall(PetscLogGpuTimeBegin()); 1070 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1071 PetscCall(VecGetKokkosView(D, &dv)); 1072 PetscCall(VecGetLocalSize(D, &nv)); 1073 n = PetscMin(Y->rmap->n, Y->cmap->n); 1074 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1075 1076 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1077 const auto &Aa = aijkok->a_dual.view_device(); 1078 const auto &Adiag = aijkok->diag_dual.view_device(); 1079 PetscCallCXX(Kokkos::parallel_for( 1080 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1081 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1082 else Aa(Adiag(i)) += dv(i); 1083 })); 1084 PetscCall(VecRestoreKokkosView(D, &dv)); 1085 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1086 PetscCall(PetscLogGpuFlops(n)); 1087 PetscCall(PetscLogGpuTimeEnd()); 1088 } else { // need reassembly, very slow! 1089 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1090 } 1091 PetscFunctionReturn(PETSC_SUCCESS); 1092 } 1093 1094 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1095 { 1096 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1097 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1098 ConstPetscScalarKokkosView lv, rv; 1099 1100 PetscFunctionBegin; 1101 PetscCall(PetscLogGpuTimeBegin()); 1102 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1103 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1104 const auto &Aa = aijkok->a_dual.view_device(); 1105 const auto &Ai = aijkok->i_dual.view_device(); 1106 const auto &Aj = aijkok->j_dual.view_device(); 1107 if (ll) { 1108 PetscCall(VecGetLocalSize(ll, &m)); 1109 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1110 PetscCall(VecGetKokkosView(ll, &lv)); 1111 PetscCallCXX(Kokkos::parallel_for( // for each row 1112 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1113 PetscInt i = t.league_rank(); // row i 1114 PetscInt len = Ai(i + 1) - Ai(i); 1115 // scale entries on the row 1116 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1117 })); 1118 PetscCall(VecRestoreKokkosView(ll, &lv)); 1119 PetscCall(PetscLogGpuFlops(nz)); 1120 } 1121 if (rr) { 1122 PetscCall(VecGetLocalSize(rr, &n)); 1123 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1124 PetscCall(VecGetKokkosView(rr, &rv)); 1125 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1126 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1127 PetscCall(VecRestoreKokkosView(rr, &lv)); 1128 PetscCall(PetscLogGpuFlops(nz)); 1129 } 1130 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1131 PetscCall(PetscLogGpuTimeEnd()); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1136 { 1137 Mat_SeqAIJKokkos *aijkok; 1138 1139 PetscFunctionBegin; 1140 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1141 if (aijkok) { /* Only zero the device if data is already there */ 1142 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1143 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1144 } else { /* Might be preallocated but not assembled */ 1145 PetscCall(MatZeroEntries_SeqAIJ(A)); 1146 } 1147 PetscFunctionReturn(PETSC_SUCCESS); 1148 } 1149 1150 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1151 { 1152 Mat_SeqAIJKokkos *aijkok; 1153 PetscInt n; 1154 PetscScalarKokkosView xv; 1155 1156 PetscFunctionBegin; 1157 PetscCall(VecGetLocalSize(x, &n)); 1158 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1159 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1160 1161 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1162 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1163 1164 const auto &Aa = aijkok->a_dual.view_device(); 1165 const auto &Ai = aijkok->i_dual.view_device(); 1166 const auto &Adiag = aijkok->diag_dual.view_device(); 1167 1168 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1169 Kokkos::parallel_for( 1170 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1171 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1172 else xv(i) = 0; 1173 }); 1174 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1175 PetscFunctionReturn(PETSC_SUCCESS); 1176 } 1177 1178 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1179 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1180 { 1181 Mat_SeqAIJKokkos *aijkok; 1182 1183 PetscFunctionBegin; 1184 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1185 PetscAssertPointer(kv, 2); 1186 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1187 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1188 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1189 *kv = aijkok->a_dual.view_device(); 1190 PetscFunctionReturn(PETSC_SUCCESS); 1191 } 1192 1193 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1194 { 1195 PetscFunctionBegin; 1196 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1197 PetscAssertPointer(kv, 2); 1198 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1199 PetscFunctionReturn(PETSC_SUCCESS); 1200 } 1201 1202 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1203 { 1204 Mat_SeqAIJKokkos *aijkok; 1205 1206 PetscFunctionBegin; 1207 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1208 PetscAssertPointer(kv, 2); 1209 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1210 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1211 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1212 *kv = aijkok->a_dual.view_device(); 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1217 { 1218 PetscFunctionBegin; 1219 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1220 PetscAssertPointer(kv, 2); 1221 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1222 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1223 PetscFunctionReturn(PETSC_SUCCESS); 1224 } 1225 1226 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1227 { 1228 Mat_SeqAIJKokkos *aijkok; 1229 1230 PetscFunctionBegin; 1231 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1232 PetscAssertPointer(kv, 2); 1233 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1234 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1235 *kv = aijkok->a_dual.view_device(); 1236 PetscFunctionReturn(PETSC_SUCCESS); 1237 } 1238 1239 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1240 { 1241 PetscFunctionBegin; 1242 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1243 PetscAssertPointer(kv, 2); 1244 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1245 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1246 PetscFunctionReturn(PETSC_SUCCESS); 1247 } 1248 1249 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1250 { 1251 Mat_SeqAIJKokkos *akok; 1252 1253 PetscFunctionBegin; 1254 auto exec = PetscGetKokkosExecutionSpace(); 1255 // Create host copies of the input aij 1256 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1257 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1258 // Don't copy the vals to the host now 1259 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1260 1261 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1262 // Note we have modified device data so it will copy lazily 1263 a_dual.modify_device(); 1264 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1265 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1266 1267 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1268 PetscCall(MatCreate(comm, A)); 1269 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1270 PetscFunctionReturn(PETSC_SUCCESS); 1271 } 1272 1273 /* Computes Y += alpha X */ 1274 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1275 { 1276 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1277 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1278 ConstMatScalarKokkosView Xa; 1279 MatScalarKokkosView Ya; 1280 auto exec = PetscGetKokkosExecutionSpace(); 1281 1282 PetscFunctionBegin; 1283 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1284 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1285 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1286 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1287 PetscCall(PetscLogGpuTimeBegin()); 1288 1289 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1290 PetscBool e; 1291 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1292 if (e) { 1293 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1294 if (e) pattern = SAME_NONZERO_PATTERN; 1295 } 1296 } 1297 1298 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1299 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1300 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1301 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1302 */ 1303 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1304 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1305 Xa = xkok->a_dual.view_device(); 1306 Ya = ykok->a_dual.view_device(); 1307 1308 if (pattern == SAME_NONZERO_PATTERN) { 1309 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1310 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1311 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1312 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1313 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1314 1315 Kokkos::parallel_for( 1316 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1317 PetscInt i = t.league_rank(); // row i 1318 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1319 // Only one thread works in a team 1320 PetscInt p, q = Yi(i); 1321 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1322 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1323 if (Xj(p) == Yj(q)) { // Found it 1324 Ya(q) += alpha * Xa(p); 1325 q++; 1326 } else { 1327 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1328 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1329 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1330 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1331 #else 1332 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1333 #endif 1334 } 1335 } 1336 }); 1337 }); 1338 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1339 } else { // different nonzero patterns 1340 Mat Z; 1341 KokkosCsrMatrix zcsr; 1342 KernelHandle kh; 1343 kh.create_spadd_handle(true); // X, Y are sorted 1344 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1345 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1346 zkok = new Mat_SeqAIJKokkos(zcsr); 1347 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1348 PetscCall(MatHeaderReplace(Y, &Z)); 1349 kh.destroy_spadd_handle(); 1350 } 1351 PetscCall(PetscLogGpuTimeEnd()); 1352 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1353 PetscFunctionReturn(PETSC_SUCCESS); 1354 } 1355 1356 struct MatCOOStruct_SeqAIJKokkos { 1357 PetscCount n; 1358 PetscCount Atot; 1359 PetscInt nz; 1360 PetscCountKokkosView jmap; 1361 PetscCountKokkosView perm; 1362 1363 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1364 { 1365 nz = coo_h->nz; 1366 n = coo_h->n; 1367 Atot = coo_h->Atot; 1368 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1369 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1370 } 1371 }; 1372 1373 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1374 { 1375 PetscFunctionBegin; 1376 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1377 PetscFunctionReturn(PETSC_SUCCESS); 1378 } 1379 1380 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1381 { 1382 Mat_SeqAIJKokkos *akok; 1383 Mat_SeqAIJ *aseq; 1384 PetscContainer container_h; 1385 MatCOOStruct_SeqAIJ *coo_h; 1386 MatCOOStruct_SeqAIJKokkos *coo_d; 1387 1388 PetscFunctionBegin; 1389 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1390 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1391 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1392 delete akok; 1393 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1394 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1395 1396 // Copy the COO struct to device 1397 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1398 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1399 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1400 1401 // Put the COO struct in a container and then attach that to the matrix 1402 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1403 PetscFunctionReturn(PETSC_SUCCESS); 1404 } 1405 1406 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1407 { 1408 MatScalarKokkosView Aa; 1409 ConstMatScalarKokkosView kv; 1410 PetscMemType memtype; 1411 PetscContainer container; 1412 MatCOOStruct_SeqAIJKokkos *coo; 1413 1414 PetscFunctionBegin; 1415 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1416 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1417 1418 const auto &n = coo->n; 1419 const auto &Annz = coo->nz; 1420 const auto &jmap = coo->jmap; 1421 const auto &perm = coo->perm; 1422 1423 PetscCall(PetscGetMemType(v, &memtype)); 1424 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1425 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1426 } else { 1427 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1428 } 1429 1430 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1431 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1432 1433 PetscCall(PetscLogGpuTimeBegin()); 1434 Kokkos::parallel_for( 1435 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1436 PetscScalar sum = 0.0; 1437 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1438 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1439 }); 1440 PetscCall(PetscLogGpuTimeEnd()); 1441 1442 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1443 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1444 PetscFunctionReturn(PETSC_SUCCESS); 1445 } 1446 1447 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1448 { 1449 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1450 1451 PetscFunctionBegin; 1452 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1453 A->boundtocpu = PETSC_FALSE; 1454 1455 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1456 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1457 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1458 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1459 A->ops->scale = MatScale_SeqAIJKokkos; 1460 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1461 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1462 A->ops->mult = MatMult_SeqAIJKokkos; 1463 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1464 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1465 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1466 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1467 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1468 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1469 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1470 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1471 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1472 A->ops->shift = MatShift_SeqAIJKokkos; 1473 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1474 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1475 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1476 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1477 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1478 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1479 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1480 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1481 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1482 1483 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1484 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1485 #if defined(PETSC_HAVE_HYPRE) 1486 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1487 #endif 1488 PetscFunctionReturn(PETSC_SUCCESS); 1489 } 1490 1491 /* 1492 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1493 1494 Input Parameters: 1495 + A - the MATSEQAIJKOKKOS matrix 1496 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1497 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1498 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1499 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1500 1501 Output Parameter: 1502 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1503 */ 1504 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1505 { 1506 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1507 PetscInt nblocks = bs.extent(0) - 1; 1508 1509 PetscFunctionBegin; 1510 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1511 1512 // Pull out the diagonal blocks of the matrix and then invert the blocks 1513 auto Aa = akok->a_dual.view_device(); 1514 auto Ai = akok->i_dual.view_device(); 1515 auto Aj = akok->j_dual.view_device(); 1516 auto Adiag = akok->diag_dual.view_device(); 1517 // TODO: how to tune the team size? 1518 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1519 auto ts = Kokkos::AUTO(); 1520 #else 1521 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1522 #endif 1523 PetscCallCXX(Kokkos::parallel_for( 1524 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1525 const PetscInt bid = teamMember.league_rank(); // block id 1526 const PetscInt rstart = bs(bid); // this block starts from this row 1527 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1528 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1529 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1530 1531 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1532 PetscInt i = rstart + r; // i-th row in A 1533 1534 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1535 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1536 1537 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1538 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1539 B(r, c) = 0.0; 1540 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1541 B(r, c) = Aa(first + c); 1542 } else { // this entry does not show up in the CSR 1543 B(r, c) = 0.0; 1544 } 1545 } 1546 } else { // rare case that the diagonal does not exist 1547 const PetscInt begin = Ai(i); 1548 const PetscInt end = Ai(i + 1); 1549 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1550 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1551 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1552 else if (Aj(j) >= rstart + m) break; 1553 } 1554 } 1555 }); 1556 1557 // LU-decompose B (w/o pivoting) and then invert B 1558 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1559 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1560 })); 1561 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1562 PetscFunctionReturn(PETSC_SUCCESS); 1563 } 1564 1565 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1566 { 1567 Mat_SeqAIJ *aseq; 1568 PetscInt i, m, n; 1569 auto exec = PetscGetKokkosExecutionSpace(); 1570 1571 PetscFunctionBegin; 1572 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1573 1574 m = akok->nrows(); 1575 n = akok->ncols(); 1576 PetscCall(MatSetSizes(A, m, n, m, n)); 1577 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1578 1579 /* Set up data structures of A as a MATSEQAIJ */ 1580 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1581 aseq = (Mat_SeqAIJ *)A->data; 1582 1583 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1584 PetscCallCXX(akok->j_dual.sync_host(exec)); 1585 PetscCallCXX(exec.fence()); 1586 1587 aseq->i = akok->i_host_data(); 1588 aseq->j = akok->j_host_data(); 1589 aseq->a = akok->a_host_data(); 1590 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1591 aseq->free_a = PETSC_FALSE; 1592 aseq->free_ij = PETSC_FALSE; 1593 aseq->nz = akok->nnz(); 1594 aseq->maxnz = aseq->nz; 1595 1596 PetscCall(PetscMalloc1(m, &aseq->imax)); 1597 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1598 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1599 1600 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1601 akok->nonzerostate = A->nonzerostate; 1602 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1603 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1604 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1605 PetscFunctionReturn(PETSC_SUCCESS); 1606 } 1607 1608 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1609 { 1610 PetscFunctionBegin; 1611 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1612 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1613 PetscFunctionReturn(PETSC_SUCCESS); 1614 } 1615 1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1617 { 1618 Mat_SeqAIJKokkos *akok; 1619 1620 PetscFunctionBegin; 1621 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1622 PetscCall(MatCreate(comm, A)); 1623 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1624 PetscFunctionReturn(PETSC_SUCCESS); 1625 } 1626 1627 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1628 1629 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1630 */ 1631 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1632 { 1633 PetscFunctionBegin; 1634 PetscCall(MatCreate(comm, A)); 1635 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 /*@C 1640 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1641 (the default parallel PETSc format). This matrix will ultimately be handled by 1642 Kokkos for calculations. 1643 1644 Collective 1645 1646 Input Parameters: 1647 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1648 . m - number of rows 1649 . n - number of columns 1650 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1651 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1652 1653 Output Parameter: 1654 . A - the matrix 1655 1656 Level: intermediate 1657 1658 Notes: 1659 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1660 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1661 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1662 1663 The AIJ format, also called 1664 compressed row storage, is fully compatible with standard Fortran 1665 storage. That is, the stored row and column indices can begin at 1666 either one (as in Fortran) or zero. 1667 1668 Specify the preallocated storage with either `nz` or `nnz` (not both). 1669 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1670 allocation. 1671 1672 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1673 @*/ 1674 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1675 { 1676 PetscFunctionBegin; 1677 PetscCall(PetscKokkosInitializeCheck()); 1678 PetscCall(MatCreate(comm, A)); 1679 PetscCall(MatSetSizes(*A, m, n, m, n)); 1680 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1681 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1682 PetscFunctionReturn(PETSC_SUCCESS); 1683 } 1684 1685 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1686 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1687 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1688 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1689 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1690 { 1691 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1692 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1693 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1694 1695 PetscFunctionBegin; 1696 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1697 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1698 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1699 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1700 } 1701 PetscFunctionReturn(PETSC_SUCCESS); 1702 } 1703 1704 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1705 { 1706 const PetscInt n = A->rmap->n; 1707 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1708 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1709 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1710 1711 PetscFunctionBegin; 1712 if (!factors->transpose_updated) { 1713 if (has_upper) { 1714 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1715 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1716 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1717 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1718 } 1719 1720 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1721 if (!factors->U) { 1722 Mat_SeqAIJ *seq; 1723 1724 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1725 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1726 1727 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1728 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1729 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1730 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1731 } else { 1732 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1733 } 1734 // Copy Ut from host to device 1735 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1736 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1737 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1738 } else { // If U was computed on device, we also compute the transpose there 1739 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1740 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1741 factors->jU_d, factors->aU_d, 1742 factors->iUt_d, factors->jUt_d, 1743 factors->aUt_d)); 1744 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1745 } 1746 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1747 } 1748 1749 // do the same for L with LU 1750 if (has_lower) { 1751 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1752 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1753 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1754 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1755 } 1756 1757 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1758 if (!factors->L) { 1759 Mat_SeqAIJ *seq; 1760 1761 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1762 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1763 1764 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1765 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1766 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1767 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1768 } else { 1769 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1770 } 1771 // Copy Lt from host to device 1772 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1773 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1774 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1775 } else { // If L was computed on device, we also compute the transpose there 1776 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1777 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1778 factors->jL_d, factors->aL_d, 1779 factors->iLt_d, factors->jLt_d, 1780 factors->aLt_d)); 1781 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1782 } 1783 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1784 } 1785 1786 factors->transpose_updated = PETSC_TRUE; 1787 } 1788 PetscFunctionReturn(PETSC_SUCCESS); 1789 } 1790 1791 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1792 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1793 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1794 { 1795 auto exec = PetscGetKokkosExecutionSpace(); 1796 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1797 PetscInt m = A->rmap->n; 1798 PetscScalarKokkosView D = factors->D_d; 1799 PetscScalarKokkosView X, Y, B; // alias 1800 ConstPetscScalarKokkosView b; 1801 PetscScalarKokkosView x; 1802 PetscIntKokkosView &rowperm = factors->rowperm; 1803 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1804 1805 PetscFunctionBegin; 1806 PetscCall(PetscLogGpuTimeBegin()); 1807 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1808 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1809 PetscCall(VecGetKokkosView(bb, &b)); 1810 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1811 1812 // Solve U^T Y = B 1813 if (identity) { // Reorder b with the row permutation 1814 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1815 Y = factors->workVector; 1816 } else { 1817 B = factors->workVector; 1818 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1819 Y = x; 1820 } 1821 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1822 1823 // Solve diag(D) Y' = Y. 1824 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1825 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1826 1827 // Solve UX = Y 1828 if (identity) { 1829 X = x; 1830 } else { 1831 X = factors->workVector; // B is not needed anymore 1832 } 1833 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1834 1835 // Reorder X with the inverse column (row) permutation 1836 if (!identity) { 1837 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1838 } 1839 1840 PetscCall(VecRestoreKokkosView(bb, &b)); 1841 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1842 PetscCall(PetscLogGpuTimeEnd()); 1843 PetscFunctionReturn(PETSC_SUCCESS); 1844 } 1845 1846 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1847 // R and C are represented by rowperm and colperm in factors. 1848 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1849 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1850 { 1851 auto exec = PetscGetKokkosExecutionSpace(); 1852 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1853 PetscInt m = A->rmap->n; 1854 PetscScalarKokkosView X, Y, B; // alias 1855 ConstPetscScalarKokkosView b; 1856 PetscScalarKokkosView x; 1857 PetscIntKokkosView &rowperm = factors->rowperm; 1858 PetscIntKokkosView &colperm = factors->colperm; 1859 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1860 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1861 1862 PetscFunctionBegin; 1863 PetscCall(PetscLogGpuTimeBegin()); 1864 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1865 PetscCall(VecGetKokkosView(bb, &b)); 1866 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1867 1868 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1869 if (row_identity) { 1870 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1871 Y = factors->workVector; 1872 } else { 1873 B = factors->workVector; 1874 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1875 Y = x; 1876 } 1877 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1878 1879 // Solve U C^- x = Y 1880 if (col_identity) { 1881 X = x; 1882 } else { 1883 X = factors->workVector; 1884 } 1885 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1886 1887 // x = C X; Reorder X with the inverse col permutation 1888 if (!col_identity) { 1889 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1890 } 1891 1892 PetscCall(VecRestoreKokkosView(bb, &b)); 1893 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1894 PetscCall(PetscLogGpuTimeEnd()); 1895 PetscFunctionReturn(PETSC_SUCCESS); 1896 } 1897 1898 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1899 // R and C are represented by rowperm and colperm in factors. 1900 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1901 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1902 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1903 { 1904 auto exec = PetscGetKokkosExecutionSpace(); 1905 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1906 PetscInt m = A->rmap->n; 1907 PetscScalarKokkosView X, Y, B; // alias 1908 ConstPetscScalarKokkosView b; 1909 PetscScalarKokkosView x; 1910 PetscIntKokkosView &rowperm = factors->rowperm; 1911 PetscIntKokkosView &colperm = factors->colperm; 1912 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1913 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1914 1915 PetscFunctionBegin; 1916 PetscCall(PetscLogGpuTimeBegin()); 1917 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1918 PetscCall(VecGetKokkosView(bb, &b)); 1919 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1920 1921 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1922 if (col_identity) { // Reorder b with the col permutation 1923 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1924 Y = factors->workVector; 1925 } else { 1926 B = factors->workVector; 1927 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1928 Y = x; 1929 } 1930 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1931 1932 // Solve L^T X = Y 1933 if (row_identity) { 1934 X = x; 1935 } else { 1936 X = factors->workVector; 1937 } 1938 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1939 1940 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1941 if (!row_identity) { 1942 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1943 } 1944 1945 PetscCall(VecRestoreKokkosView(bb, &b)); 1946 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1947 PetscCall(PetscLogGpuTimeEnd()); 1948 PetscFunctionReturn(PETSC_SUCCESS); 1949 } 1950 1951 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1952 { 1953 PetscFunctionBegin; 1954 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1955 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1956 1957 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1958 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1959 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1960 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1961 const MatScalar *Ba = b->a; 1962 PetscInt m = B->rmap->n, n = B->cmap->n; 1963 1964 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1965 // Allocate memory and copy the structure 1966 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1967 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1968 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1969 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1970 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1971 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1972 1973 PetscInt *Li = factors->iL_h.data(); 1974 PetscInt *Lj = factors->jL_h.data(); 1975 PetscInt *Ui = factors->iU_h.data(); 1976 PetscInt *Uj = factors->jU_h.data(); 1977 1978 Li[0] = Ui[0] = 0; 1979 for (PetscInt i = 0; i < m; i++) { 1980 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1981 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1982 1983 PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal 1984 Lj[Li[i] + llen] = i; // diagonal entry of L 1985 1986 Uj[Ui[i]] = i; // diagonal entry of U 1987 PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal 1988 1989 Li[i + 1] = Li[i] + llen + 1; 1990 Ui[i + 1] = Ui[i] + ulen; 1991 } 1992 1993 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1994 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1995 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1996 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1997 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1998 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1999 2000 // Copy row/col permutation to device 2001 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2002 PetscBool row_identity; 2003 PetscCall(ISIdentity(rowperm, &row_identity)); 2004 if (!row_identity) { 2005 const PetscInt *ip; 2006 2007 PetscCall(ISGetIndices(rowperm, &ip)); 2008 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 2009 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2010 PetscCall(ISRestoreIndices(rowperm, &ip)); 2011 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2012 } 2013 2014 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 2015 PetscBool col_identity; 2016 PetscCall(ISIdentity(colperm, &col_identity)); 2017 if (!col_identity) { 2018 const PetscInt *ip; 2019 2020 PetscCall(ISGetIndices(colperm, &ip)); 2021 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2022 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2023 PetscCall(ISRestoreIndices(colperm, &ip)); 2024 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2025 } 2026 2027 /* Create sptrsv handles for L, U and their transpose */ 2028 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2029 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2030 #else 2031 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2032 #endif 2033 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2034 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2035 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2036 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2037 } 2038 2039 // Copy the value 2040 for (PetscInt i = 0; i < m; i++) { 2041 PetscInt llen = Bi[i + 1] - Bi[i]; 2042 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2043 const PetscInt *Li = factors->iL_h.data(); 2044 const PetscInt *Ui = factors->iU_h.data(); 2045 2046 PetscScalar *La = factors->aL_h.data(); 2047 PetscScalar *Ua = factors->aU_h.data(); 2048 2049 PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L 2050 La[Li[i] + llen] = 1.0; // diagonal entry 2051 2052 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2053 PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U 2054 } 2055 2056 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2057 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2058 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2059 factors->transpose_updated = PETSC_FALSE; 2060 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2061 2062 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2063 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2064 } 2065 2066 B->ops->matsolve = NULL; 2067 B->ops->matsolvetranspose = NULL; 2068 PetscFunctionReturn(PETSC_SUCCESS); 2069 } 2070 2071 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2072 { 2073 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2074 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2075 PetscInt fill_lev = info->levels; 2076 2077 PetscFunctionBegin; 2078 PetscCall(PetscLogGpuTimeBegin()); 2079 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2080 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2081 2082 auto a_d = aijkok->a_dual.view_device(); 2083 auto i_d = aijkok->i_dual.view_device(); 2084 auto j_d = aijkok->j_dual.view_device(); 2085 2086 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2087 2088 B->assembled = PETSC_TRUE; 2089 B->preallocated = PETSC_TRUE; 2090 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2091 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2092 B->ops->matsolve = NULL; 2093 B->ops->matsolvetranspose = NULL; 2094 2095 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2096 factors->transpose_updated = PETSC_FALSE; 2097 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2098 /* TODO: log flops, but how to know that? */ 2099 PetscCall(PetscLogGpuTimeEnd()); 2100 PetscFunctionReturn(PETSC_SUCCESS); 2101 } 2102 2103 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2104 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2105 { 2106 Mat_SeqAIJKokkos *aijkok; 2107 Mat_SeqAIJ *b; 2108 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2109 PetscInt fill_lev = info->levels; 2110 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2111 PetscInt n = A->rmap->n; 2112 2113 PetscFunctionBegin; 2114 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2115 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2116 2117 /* Create a spiluk handle and then do symbolic factorization */ 2118 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2119 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2120 2121 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2122 2123 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2124 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2125 Kokkos::realloc(factors->iU_d, n + 1); 2126 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2127 2128 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2129 auto i_d = aijkok->i_dual.view_device(); 2130 auto j_d = aijkok->j_dual.view_device(); 2131 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2132 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2133 2134 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2135 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2136 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2137 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2138 2139 /* TODO: add options to select sptrsv algorithms */ 2140 /* Create sptrsv handles for L, U and their transpose */ 2141 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2142 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2143 #else 2144 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2145 #endif 2146 2147 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2148 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2149 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2150 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2151 2152 /* Fill fields of the factor matrix B */ 2153 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2154 b = (Mat_SeqAIJ *)B->data; 2155 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2156 B->info.fill_ratio_given = info->fill; 2157 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2158 2159 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2160 PetscFunctionReturn(PETSC_SUCCESS); 2161 } 2162 2163 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2164 { 2165 PetscFunctionBegin; 2166 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2167 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2168 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2169 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2174 { 2175 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2176 2177 PetscFunctionBegin; 2178 if (!info->factoronhost) { 2179 PetscCall(ISIdentity(isrow, &row_identity)); 2180 PetscCall(ISIdentity(iscol, &col_identity)); 2181 } 2182 2183 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2184 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2185 2186 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2187 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2188 } else { 2189 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2190 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2191 } 2192 PetscFunctionReturn(PETSC_SUCCESS); 2193 } 2194 2195 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2196 { 2197 PetscFunctionBegin; 2198 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2199 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2200 2201 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2202 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2203 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2204 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2205 const MatScalar *Ba = b->a; 2206 PetscInt m = B->rmap->n; 2207 2208 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2209 // Allocate memory and copy the structure 2210 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2211 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2212 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2213 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2214 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2215 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2216 2217 // Build jU_h from the skewed Aj 2218 PetscInt *Uj = factors->jU_h.data(); 2219 for (PetscInt i = 0; i < m; i++) { 2220 PetscInt ulen = Bi[i + 1] - Bi[i]; 2221 Uj[Bi[i]] = i; // diagonal entry 2222 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2223 } 2224 2225 // Copy iU, jU to device 2226 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2227 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2228 2229 // Copy row/col permutation to device 2230 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2231 PetscBool row_identity; 2232 PetscCall(ISIdentity(rowperm, &row_identity)); 2233 if (!row_identity) { 2234 const PetscInt *ip; 2235 2236 PetscCall(ISGetIndices(rowperm, &ip)); 2237 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2238 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2239 PetscCall(ISRestoreIndices(rowperm, &ip)); 2240 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2241 } 2242 2243 // Create sptrsv handles for U and U^T 2244 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2245 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2246 #else 2247 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2248 #endif 2249 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2250 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2251 } 2252 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2253 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2254 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2255 2256 // Copy the value 2257 PetscScalar *Ua = factors->aU_h.data(); 2258 PetscScalar *D = factors->D_h.data(); 2259 for (PetscInt i = 0; i < m; i++) { 2260 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2261 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2262 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2263 } 2264 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2265 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2266 2267 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2268 factors->transpose_updated = PETSC_FALSE; 2269 } 2270 2271 B->ops->matsolve = NULL; 2272 B->ops->matsolvetranspose = NULL; 2273 PetscFunctionReturn(PETSC_SUCCESS); 2274 } 2275 2276 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2277 { 2278 PetscFunctionBegin; 2279 if (info->solveonhost) { 2280 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2281 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2282 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2283 } 2284 2285 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2286 2287 if (!info->solveonhost) { 2288 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2289 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2290 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2291 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2292 } 2293 PetscFunctionReturn(PETSC_SUCCESS); 2294 } 2295 2296 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2297 { 2298 PetscFunctionBegin; 2299 if (info->solveonhost) { 2300 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2301 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2302 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2303 } 2304 2305 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2306 2307 if (!info->solveonhost) { 2308 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2309 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2310 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2311 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2312 } 2313 PetscFunctionReturn(PETSC_SUCCESS); 2314 } 2315 2316 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2317 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2318 { 2319 PetscFunctionBegin; 2320 *type = MATSOLVERKOKKOS; 2321 PetscFunctionReturn(PETSC_SUCCESS); 2322 } 2323 2324 /*MC 2325 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2326 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2327 2328 Level: beginner 2329 2330 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2331 M*/ 2332 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2333 { 2334 PetscInt n = A->rmap->n; 2335 MPI_Comm comm; 2336 2337 PetscFunctionBegin; 2338 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2339 PetscCall(MatCreate(comm, B)); 2340 PetscCall(MatSetSizes(*B, n, n, n, n)); 2341 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2342 (*B)->factortype = ftype; 2343 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2344 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2345 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2346 2347 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2348 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2349 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2350 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2351 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2352 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2353 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2354 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2355 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2356 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2357 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2358 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2359 2360 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2361 (*B)->canuseordering = PETSC_TRUE; 2362 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2363 PetscFunctionReturn(PETSC_SUCCESS); 2364 } 2365 2366 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2367 { 2368 PetscFunctionBegin; 2369 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2370 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2371 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2372 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2373 PetscFunctionReturn(PETSC_SUCCESS); 2374 } 2375 2376 /* Utility to print out a KokkosCsrMatrix for debugging */ 2377 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2378 { 2379 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2380 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2381 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2382 const PetscInt *i = iv.data(); 2383 const PetscInt *j = jv.data(); 2384 const PetscScalar *a = av.data(); 2385 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2386 2387 PetscFunctionBegin; 2388 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2389 for (PetscInt k = 0; k < m; k++) { 2390 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2391 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2392 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2393 } 2394 PetscFunctionReturn(PETSC_SUCCESS); 2395 } 2396