1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 #define DISABLE_CUSPARSE_DEPRECATED 20 #include <KokkosSparse_spmv.hpp> 21 22 #include <KokkosSparse_spiluk.hpp> 23 #include <KokkosSparse_sptrsv.hpp> 24 #include <KokkosSparse_spgemm.hpp> 25 #include <KokkosSparse_spadd.hpp> 26 #include <KokkosBatched_LU_Decl.hpp> 27 #include <KokkosBatched_InverseLU_Decl.hpp> 28 29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 30 31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32 #include <KokkosSparse_Utils.hpp> 33 using KokkosSparse::sort_crs_matrix; 34 using KokkosSparse::Impl::transpose_matrix; 35 #else 36 #include <KokkosKernels_Sorting.hpp> 37 using KokkosKernels::sort_crs_matrix; 38 using KokkosKernels::Impl::transpose_matrix; 39 #endif 40 41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 42 using KokkosSparse::spiluk_symbolic; 43 using KokkosSparse::spiluk_numeric; 44 using KokkosSparse::sptrsv_symbolic; 45 using KokkosSparse::sptrsv_solve; 46 using KokkosSparse::Experimental::SPTRSVAlgorithm; 47 using KokkosSparse::Experimental::SPILUKAlgorithm; 48 #else 49 using KokkosSparse::Experimental::spiluk_symbolic; 50 using KokkosSparse::Experimental::spiluk_numeric; 51 using KokkosSparse::Experimental::sptrsv_symbolic; 52 using KokkosSparse::Experimental::sptrsv_solve; 53 using KokkosSparse::Experimental::SPTRSVAlgorithm; 54 using KokkosSparse::Experimental::SPILUKAlgorithm; 55 #endif 56 57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 58 59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 60 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 61 In the latter case, it is important to set a_dual's sync state correctly. 62 */ 63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 64 { 65 Mat_SeqAIJ *aijseq; 66 Mat_SeqAIJKokkos *aijkok; 67 68 PetscFunctionBegin; 69 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 70 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 71 72 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 73 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 74 75 /* If aijkok does not exist, we just copy i, j to device. 76 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 77 In both cases, we build a new aijkok structure. 78 */ 79 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 80 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 81 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 82 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 84 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 85 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 86 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 87 aijseq->free_a = PETSC_TRUE; 88 aijseq->free_ij = PETSC_TRUE; 89 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 90 we have the CsrMatrix on device first and then copy to host, followed by 91 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 92 One could improve it by not using NULL garray. 93 */ 94 } 95 delete aijkok; 96 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 97 A->spptr = aijkok; 98 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 99 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 100 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 101 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 102 } 103 PetscFunctionReturn(PETSC_SUCCESS); 104 } 105 106 /* Sync CSR data to device if not yet */ 107 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110 111 PetscFunctionBegin; 112 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 113 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 114 if (aijkok->a_dual.need_sync_device()) { 115 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 116 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 117 aijkok->hermitian_updated = PETSC_FALSE; 118 } 119 PetscFunctionReturn(PETSC_SUCCESS); 120 } 121 122 /* Mark the CSR data on device as modified */ 123 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 124 { 125 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 126 127 PetscFunctionBegin; 128 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 129 aijkok->a_dual.clear_sync_state(); 130 aijkok->a_dual.modify_device(); 131 aijkok->transpose_updated = PETSC_FALSE; 132 aijkok->hermitian_updated = PETSC_FALSE; 133 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 134 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 139 { 140 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 141 auto exec = PetscGetKokkosExecutionSpace(); 142 143 PetscFunctionBegin; 144 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 145 /* We do not expect one needs factors on host */ 146 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 147 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 148 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 149 PetscFunctionReturn(PETSC_SUCCESS); 150 } 151 152 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 153 { 154 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 155 156 PetscFunctionBegin; 157 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 158 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 159 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 160 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 161 */ 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 164 *array = aijkok->a_dual.view_host().data(); 165 } else { /* Happens when calling MatSetValues on a newly created matrix */ 166 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 167 } 168 PetscFunctionReturn(PETSC_SUCCESS); 169 } 170 171 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 172 { 173 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 174 175 PetscFunctionBegin; 176 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 177 PetscFunctionReturn(PETSC_SUCCESS); 178 } 179 180 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 181 { 182 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 183 184 PetscFunctionBegin; 185 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 186 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 187 *array = aijkok->a_dual.view_host().data(); 188 } else { 189 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 190 } 191 PetscFunctionReturn(PETSC_SUCCESS); 192 } 193 194 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 195 { 196 PetscFunctionBegin; 197 *array = NULL; 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 204 205 PetscFunctionBegin; 206 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 207 *array = aijkok->a_dual.view_host().data(); 208 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 209 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 210 } 211 PetscFunctionReturn(PETSC_SUCCESS); 212 } 213 214 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 215 { 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 217 218 PetscFunctionBegin; 219 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 220 aijkok->a_dual.clear_sync_state(); 221 aijkok->a_dual.modify_host(); 222 } 223 PetscFunctionReturn(PETSC_SUCCESS); 224 } 225 226 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 227 { 228 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 229 230 PetscFunctionBegin; 231 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 232 233 if (i) *i = aijkok->i_device_data(); 234 if (j) *j = aijkok->j_device_data(); 235 if (a) { 236 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 237 *a = aijkok->a_device_data(); 238 } 239 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 240 PetscFunctionReturn(PETSC_SUCCESS); 241 } 242 243 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 244 { 245 PetscFunctionBegin; 246 *mtype = PETSC_MEMTYPE_KOKKOS; 247 PetscFunctionReturn(PETSC_SUCCESS); 248 } 249 250 /* 251 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 252 253 Input Parameter: 254 . A - the MATSEQAIJKOKKOS matrix 255 256 Output Parameters: 257 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 258 - T_d - the transpose on device, whose value array is allocated but not initialized 259 */ 260 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 261 { 262 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 263 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 264 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 265 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 266 MatRowMapType *Ti = Ti_h.data(); 267 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 268 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 269 PetscInt *Tj = Tj_h.data(); 270 PetscInt *perm = perm_h.data(); 271 PetscInt *offset; 272 273 PetscFunctionBegin; 274 // Populate Ti 275 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 276 Ti++; 277 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 278 Ti--; 279 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 280 281 // Populate Tj and the permutation array 282 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 283 for (PetscInt i = 0; i < m; i++) { 284 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 285 PetscInt r = Aj[j]; // row r of T 286 PetscInt disp = Ti[r] + offset[r]; 287 288 Tj[disp] = i; // col i of T 289 perm[disp] = j; 290 offset[r]++; 291 } 292 } 293 PetscCall(PetscFree(offset)); 294 295 // Sort each row of T, along with the permutation array 296 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 297 298 // Output perm and T on device 299 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 300 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 301 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 302 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 303 PetscFunctionReturn(PETSC_SUCCESS); 304 } 305 306 // Generate the transpose on device and cache it internally 307 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 308 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 309 { 310 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 311 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 312 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 313 KokkosCsrMatrix &T = akok->csrmatT; 314 315 PetscFunctionBegin; 316 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 317 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 318 319 const auto &Aa = akok->a_dual.view_device(); 320 321 if (A->symmetric == PETSC_BOOL3_TRUE) { 322 *csrmatT = akok->csrmat; 323 } else { 324 // See if we already have a cached transpose and its value is up to date 325 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 326 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 327 const auto &perm = akok->transpose_perm; // get the permutation array 328 auto &Ta = T.values; 329 330 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 331 } 332 } else { // Generate T of size n x m for the first time 333 MatRowMapKokkosView perm; 334 335 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 336 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 337 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 338 } 339 akok->transpose_updated = PETSC_TRUE; 340 *csrmatT = akok->csrmatT; 341 } 342 PetscFunctionReturn(PETSC_SUCCESS); 343 } 344 345 // Generate the Hermitian on device and cache it internally 346 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 347 { 348 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 349 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 350 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 351 KokkosCsrMatrix &T = akok->csrmatH; 352 353 PetscFunctionBegin; 354 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 355 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 356 357 const auto &Aa = akok->a_dual.view_device(); 358 359 if (A->hermitian == PETSC_BOOL3_TRUE) { 360 *csrmatH = akok->csrmat; 361 } else { 362 // See if we already have a cached hermitian and its value is up to date 363 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 364 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 365 const auto &perm = akok->transpose_perm; // get the permutation array 366 auto &Ta = T.values; 367 368 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 369 } 370 } else { // Generate T of size n x m for the first time 371 MatRowMapKokkosView perm; 372 373 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 374 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 375 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 376 } 377 akok->hermitian_updated = PETSC_TRUE; 378 *csrmatH = akok->csrmatH; 379 } 380 PetscFunctionReturn(PETSC_SUCCESS); 381 } 382 383 /* y = A x */ 384 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 385 { 386 Mat_SeqAIJKokkos *aijkok; 387 ConstPetscScalarKokkosView xv; 388 PetscScalarKokkosView yv; 389 390 PetscFunctionBegin; 391 PetscCall(PetscLogGpuTimeBegin()); 392 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 393 PetscCall(VecGetKokkosView(xx, &xv)); 394 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 395 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 396 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 397 PetscCall(VecRestoreKokkosView(xx, &xv)); 398 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 399 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 400 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 401 PetscCall(PetscLogGpuTimeEnd()); 402 PetscFunctionReturn(PETSC_SUCCESS); 403 } 404 405 /* y = A^T x */ 406 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 407 { 408 Mat_SeqAIJKokkos *aijkok; 409 const char *mode; 410 ConstPetscScalarKokkosView xv; 411 PetscScalarKokkosView yv; 412 KokkosCsrMatrix csrmat; 413 414 PetscFunctionBegin; 415 PetscCall(PetscLogGpuTimeBegin()); 416 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 417 PetscCall(VecGetKokkosView(xx, &xv)); 418 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 419 if (A->form_explicit_transpose) { 420 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 421 mode = "N"; 422 } else { 423 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 424 csrmat = aijkok->csrmat; 425 mode = "T"; 426 } 427 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 428 PetscCall(VecRestoreKokkosView(xx, &xv)); 429 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 430 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 431 PetscCall(PetscLogGpuTimeEnd()); 432 PetscFunctionReturn(PETSC_SUCCESS); 433 } 434 435 /* y = A^H x */ 436 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 437 { 438 Mat_SeqAIJKokkos *aijkok; 439 const char *mode; 440 ConstPetscScalarKokkosView xv; 441 PetscScalarKokkosView yv; 442 KokkosCsrMatrix csrmat; 443 444 PetscFunctionBegin; 445 PetscCall(PetscLogGpuTimeBegin()); 446 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 447 PetscCall(VecGetKokkosView(xx, &xv)); 448 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 449 if (A->form_explicit_transpose) { 450 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 451 mode = "N"; 452 } else { 453 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 454 csrmat = aijkok->csrmat; 455 mode = "C"; 456 } 457 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 458 PetscCall(VecRestoreKokkosView(xx, &xv)); 459 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 460 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 461 PetscCall(PetscLogGpuTimeEnd()); 462 PetscFunctionReturn(PETSC_SUCCESS); 463 } 464 465 /* z = A x + y */ 466 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 467 { 468 Mat_SeqAIJKokkos *aijkok; 469 ConstPetscScalarKokkosView xv; 470 PetscScalarKokkosView zv; 471 472 PetscFunctionBegin; 473 PetscCall(PetscLogGpuTimeBegin()); 474 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 475 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 476 PetscCall(VecGetKokkosView(xx, &xv)); 477 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 478 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 479 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 480 PetscCall(VecRestoreKokkosView(xx, &xv)); 481 PetscCall(VecRestoreKokkosView(zz, &zv)); 482 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 483 PetscCall(PetscLogGpuTimeEnd()); 484 PetscFunctionReturn(PETSC_SUCCESS); 485 } 486 487 /* z = A^T x + y */ 488 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 489 { 490 Mat_SeqAIJKokkos *aijkok; 491 const char *mode; 492 ConstPetscScalarKokkosView xv; 493 PetscScalarKokkosView zv; 494 KokkosCsrMatrix csrmat; 495 496 PetscFunctionBegin; 497 PetscCall(PetscLogGpuTimeBegin()); 498 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 499 if (zz != yy) PetscCall(VecCopy(yy, zz)); 500 PetscCall(VecGetKokkosView(xx, &xv)); 501 PetscCall(VecGetKokkosView(zz, &zv)); 502 if (A->form_explicit_transpose) { 503 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 504 mode = "N"; 505 } else { 506 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 507 csrmat = aijkok->csrmat; 508 mode = "T"; 509 } 510 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 511 PetscCall(VecRestoreKokkosView(xx, &xv)); 512 PetscCall(VecRestoreKokkosView(zz, &zv)); 513 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 514 PetscCall(PetscLogGpuTimeEnd()); 515 PetscFunctionReturn(PETSC_SUCCESS); 516 } 517 518 /* z = A^H x + y */ 519 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 520 { 521 Mat_SeqAIJKokkos *aijkok; 522 const char *mode; 523 ConstPetscScalarKokkosView xv; 524 PetscScalarKokkosView zv; 525 KokkosCsrMatrix csrmat; 526 527 PetscFunctionBegin; 528 PetscCall(PetscLogGpuTimeBegin()); 529 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 530 if (zz != yy) PetscCall(VecCopy(yy, zz)); 531 PetscCall(VecGetKokkosView(xx, &xv)); 532 PetscCall(VecGetKokkosView(zz, &zv)); 533 if (A->form_explicit_transpose) { 534 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 535 mode = "N"; 536 } else { 537 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 538 csrmat = aijkok->csrmat; 539 mode = "C"; 540 } 541 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 542 PetscCall(VecRestoreKokkosView(xx, &xv)); 543 PetscCall(VecRestoreKokkosView(zz, &zv)); 544 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 545 PetscCall(PetscLogGpuTimeEnd()); 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 549 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 550 { 551 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 552 553 PetscFunctionBegin; 554 switch (op) { 555 case MAT_FORM_EXPLICIT_TRANSPOSE: 556 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 557 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 558 A->form_explicit_transpose = flg; 559 break; 560 default: 561 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 562 break; 563 } 564 PetscFunctionReturn(PETSC_SUCCESS); 565 } 566 567 /* Depending on reuse, either build a new mat, or use the existing mat */ 568 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 569 { 570 Mat_SeqAIJ *aseq; 571 572 PetscFunctionBegin; 573 PetscCall(PetscKokkosInitializeCheck()); 574 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 575 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 576 PetscCall(MatSetType(*newmat, mtype)); 577 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 578 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 579 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 580 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 581 PetscCall(PetscFree(A->defaultvectype)); 582 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 583 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 584 PetscCall(MatSetOps_SeqAIJKokkos(A)); 585 aseq = static_cast<Mat_SeqAIJ *>(A->data); 586 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 587 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 588 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 589 } 590 } 591 PetscFunctionReturn(PETSC_SUCCESS); 592 } 593 594 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 595 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 596 */ 597 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 598 { 599 Mat_SeqAIJ *bseq; 600 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 601 Mat mat; 602 603 PetscFunctionBegin; 604 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 605 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 606 mat = *B; 607 if (A->assembled) { 608 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 609 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 610 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 611 /* Now copy values to B if needed */ 612 if (dupOption == MAT_COPY_VALUES) { 613 if (akok->a_dual.need_sync_device()) { 614 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 615 bkok->a_dual.modify_host(); 616 } else { /* If device has the latest data, we only copy data on device */ 617 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 618 bkok->a_dual.modify_device(); 619 } 620 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 621 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 622 bkok->a_dual.modify_host(); 623 } 624 mat->spptr = bkok; 625 } 626 627 PetscCall(PetscFree(mat->defaultvectype)); 628 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 629 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 630 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 631 PetscFunctionReturn(PETSC_SUCCESS); 632 } 633 634 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 635 { 636 Mat At; 637 KokkosCsrMatrix internT; 638 Mat_SeqAIJKokkos *atkok, *bkok; 639 640 PetscFunctionBegin; 641 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 642 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 643 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 644 /* Deep copy internT, as we want to isolate the internal transpose */ 645 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 646 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 647 if (reuse == MAT_INITIAL_MATRIX) *B = At; 648 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 649 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 650 if ((*B)->assembled) { 651 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 652 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 653 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 654 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 655 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 656 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 657 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 658 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 659 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 660 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 661 } 662 PetscFunctionReturn(PETSC_SUCCESS); 663 } 664 665 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 666 { 667 Mat_SeqAIJKokkos *aijkok; 668 669 PetscFunctionBegin; 670 if (A->factortype == MAT_FACTOR_NONE) { 671 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 672 delete aijkok; 673 } else { 674 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 675 } 676 A->spptr = NULL; 677 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 678 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 680 #if defined(PETSC_HAVE_HYPRE) 681 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 682 #endif 683 PetscCall(MatDestroy_SeqAIJ(A)); 684 PetscFunctionReturn(PETSC_SUCCESS); 685 } 686 687 /*MC 688 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 689 690 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 691 692 Options Database Key: 693 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 694 695 Level: beginner 696 697 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 698 M*/ 699 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 700 { 701 PetscFunctionBegin; 702 PetscCall(PetscKokkosInitializeCheck()); 703 PetscCall(MatCreate_SeqAIJ(A)); 704 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 705 PetscFunctionReturn(PETSC_SUCCESS); 706 } 707 708 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 709 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 710 { 711 Mat_SeqAIJ *a, *b; 712 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 713 MatScalarKokkosView aa, ba, ca; 714 MatRowMapKokkosView ai, bi, ci; 715 MatColIdxKokkosView aj, bj, cj; 716 PetscInt m, n, nnz, aN; 717 718 PetscFunctionBegin; 719 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 720 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 721 PetscAssertPointer(C, 4); 722 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 723 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 724 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 725 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 726 727 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 728 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 729 a = static_cast<Mat_SeqAIJ *>(A->data); 730 b = static_cast<Mat_SeqAIJ *>(B->data); 731 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 732 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 733 aa = akok->a_dual.view_device(); 734 ai = akok->i_dual.view_device(); 735 ba = bkok->a_dual.view_device(); 736 bi = bkok->i_dual.view_device(); 737 m = A->rmap->n; /* M, N and nnz of C */ 738 n = A->cmap->n + B->cmap->n; 739 nnz = a->nz + b->nz; 740 aN = A->cmap->n; /* N of A */ 741 if (reuse == MAT_INITIAL_MATRIX) { 742 aj = akok->j_dual.view_device(); 743 bj = bkok->j_dual.view_device(); 744 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 745 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 746 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 747 748 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 749 Kokkos::parallel_for( 750 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 751 PetscInt i = t.league_rank(); /* row i */ 752 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 753 754 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 755 ci(i) = coffset; 756 if (i == m - 1) ci(m) = ai(m) + bi(m); 757 }); 758 759 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 760 if (k < alen) { 761 ca(coffset + k) = aa(ai(i) + k); 762 cj(coffset + k) = aj(ai(i) + k); 763 } else { 764 ca(coffset + k) = ba(bi(i) + k - alen); 765 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 766 } 767 }); 768 }); 769 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 770 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 771 } else if (reuse == MAT_REUSE_MATRIX) { 772 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 773 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 774 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 775 ca = ckok->a_dual.view_device(); 776 ci = ckok->i_dual.view_device(); 777 778 Kokkos::parallel_for( 779 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 780 PetscInt i = t.league_rank(); /* row i */ 781 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 782 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 783 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 784 else ca(ci(i) + k) = ba(bi(i) + k - alen); 785 }); 786 }); 787 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 788 } 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 793 { 794 PetscFunctionBegin; 795 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 796 PetscFunctionReturn(PETSC_SUCCESS); 797 } 798 799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 800 { 801 Mat_Product *product = C->product; 802 Mat A, B; 803 bool transA, transB; /* use bool, since KK needs this type */ 804 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 805 Mat_SeqAIJ *c; 806 MatProductData_SeqAIJKokkos *pdata; 807 KokkosCsrMatrix csrmatA, csrmatB; 808 809 PetscFunctionBegin; 810 MatCheckProduct(C, 1); 811 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 812 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 813 814 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 815 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 816 // we still do numeric. 817 if (pdata->reusesym) { // numeric reuses results from symbolic 818 pdata->reusesym = PETSC_FALSE; 819 PetscFunctionReturn(PETSC_SUCCESS); 820 } 821 822 switch (product->type) { 823 case MATPRODUCT_AB: 824 transA = false; 825 transB = false; 826 break; 827 case MATPRODUCT_AtB: 828 transA = true; 829 transB = false; 830 break; 831 case MATPRODUCT_ABt: 832 transA = false; 833 transB = true; 834 break; 835 default: 836 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 837 } 838 839 A = product->A; 840 B = product->B; 841 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 842 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 843 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 844 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 845 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 846 847 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 848 849 csrmatA = akok->csrmat; 850 csrmatB = bkok->csrmat; 851 852 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 853 if (transA) { 854 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 855 transA = false; 856 } 857 858 if (transB) { 859 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 860 transB = false; 861 } 862 PetscCall(PetscLogGpuTimeBegin()); 863 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 865 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 866 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 867 #endif 868 869 PetscCall(PetscLogGpuTimeEnd()); 870 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 871 /* shorter version of MatAssemblyEnd_SeqAIJ */ 872 c = (Mat_SeqAIJ *)C->data; 873 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 874 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 875 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 876 c->reallocs = 0; 877 C->info.mallocs = 0; 878 C->info.nz_unneeded = 0; 879 C->assembled = C->was_assembled = PETSC_TRUE; 880 C->num_ass++; 881 PetscFunctionReturn(PETSC_SUCCESS); 882 } 883 884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 885 { 886 Mat_Product *product = C->product; 887 MatProductType ptype; 888 Mat A, B; 889 bool transA, transB; 890 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 891 MatProductData_SeqAIJKokkos *pdata; 892 MPI_Comm comm; 893 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 894 895 PetscFunctionBegin; 896 MatCheckProduct(C, 1); 897 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 898 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 899 A = product->A; 900 B = product->B; 901 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 902 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 903 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 904 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 905 csrmatA = akok->csrmat; 906 csrmatB = bkok->csrmat; 907 908 ptype = product->type; 909 // Take advantage of the symmetry if true 910 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 911 ptype = MATPRODUCT_AB; 912 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 913 } 914 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 915 ptype = MATPRODUCT_AB; 916 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 917 } 918 919 switch (ptype) { 920 case MATPRODUCT_AB: 921 transA = false; 922 transB = false; 923 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 924 break; 925 case MATPRODUCT_AtB: 926 transA = true; 927 transB = false; 928 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 929 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 930 break; 931 case MATPRODUCT_ABt: 932 transA = false; 933 transB = true; 934 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 935 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 936 break; 937 default: 938 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 939 } 940 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 941 pdata->reusesym = product->api_user; 942 943 /* TODO: add command line options to select spgemm algorithms */ 944 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 945 946 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 948 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 949 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 950 #endif 951 #endif 952 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 953 954 PetscCall(PetscLogGpuTimeBegin()); 955 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 956 if (transA) { 957 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 958 transA = false; 959 } 960 961 if (transB) { 962 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 963 transB = false; 964 } 965 966 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 967 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 968 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 969 calling new Mat_SeqAIJKokkos(). 970 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 971 */ 972 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 974 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 975 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 976 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 977 #endif 978 PetscCall(PetscLogGpuTimeEnd()); 979 980 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 981 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 982 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 983 PetscFunctionReturn(PETSC_SUCCESS); 984 } 985 986 /* handles sparse matrix matrix ops */ 987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 988 { 989 Mat_Product *product = mat->product; 990 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 991 992 PetscFunctionBegin; 993 MatCheckProduct(mat, 1); 994 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 995 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 996 if (Biskok && Ciskok) { 997 switch (product->type) { 998 case MATPRODUCT_AB: 999 case MATPRODUCT_AtB: 1000 case MATPRODUCT_ABt: 1001 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1002 break; 1003 case MATPRODUCT_PtAP: 1004 case MATPRODUCT_RARt: 1005 case MATPRODUCT_ABC: 1006 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1007 break; 1008 default: 1009 break; 1010 } 1011 } else { /* fallback for AIJ */ 1012 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1018 { 1019 Mat_SeqAIJKokkos *aijkok; 1020 1021 PetscFunctionBegin; 1022 PetscCall(PetscLogGpuTimeBegin()); 1023 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1024 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1025 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1026 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1027 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1028 PetscCall(PetscLogGpuTimeEnd()); 1029 PetscFunctionReturn(PETSC_SUCCESS); 1030 } 1031 1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1034 { 1035 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1036 1037 PetscFunctionBegin; 1038 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1039 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1040 1041 PetscCall(PetscLogGpuTimeBegin()); 1042 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1043 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1044 const auto &Aa = aijkok->a_dual.view_device(); 1045 const auto &Adiag = aijkok->diag_dual.view_device(); 1046 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1047 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1048 PetscCall(PetscLogGpuFlops(n)); 1049 PetscCall(PetscLogGpuTimeEnd()); 1050 } else { // need reassembly, very slow! 1051 PetscCall(MatShift_Basic(A, a)); 1052 } 1053 PetscFunctionReturn(PETSC_SUCCESS); 1054 } 1055 1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1057 { 1058 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1059 1060 PetscFunctionBegin; 1061 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1062 ConstPetscScalarKokkosView dv; 1063 PetscInt n, nv; 1064 1065 PetscCall(PetscLogGpuTimeBegin()); 1066 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1067 PetscCall(VecGetKokkosView(D, &dv)); 1068 PetscCall(VecGetLocalSize(D, &nv)); 1069 n = PetscMin(Y->rmap->n, Y->cmap->n); 1070 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1071 1072 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1073 const auto &Aa = aijkok->a_dual.view_device(); 1074 const auto &Adiag = aijkok->diag_dual.view_device(); 1075 PetscCallCXX(Kokkos::parallel_for( 1076 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1077 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1078 else Aa(Adiag(i)) += dv(i); 1079 })); 1080 PetscCall(VecRestoreKokkosView(D, &dv)); 1081 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1082 PetscCall(PetscLogGpuFlops(n)); 1083 PetscCall(PetscLogGpuTimeEnd()); 1084 } else { // need reassembly, very slow! 1085 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1086 } 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1091 { 1092 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1093 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1094 ConstPetscScalarKokkosView lv, rv; 1095 1096 PetscFunctionBegin; 1097 PetscCall(PetscLogGpuTimeBegin()); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 const auto &Aa = aijkok->a_dual.view_device(); 1101 const auto &Ai = aijkok->i_dual.view_device(); 1102 const auto &Aj = aijkok->j_dual.view_device(); 1103 if (ll) { 1104 PetscCall(VecGetLocalSize(ll, &m)); 1105 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1106 PetscCall(VecGetKokkosView(ll, &lv)); 1107 PetscCallCXX(Kokkos::parallel_for( // for each row 1108 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1109 PetscInt i = t.league_rank(); // row i 1110 PetscInt len = Ai(i + 1) - Ai(i); 1111 // scale entries on the row 1112 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1113 })); 1114 PetscCall(VecRestoreKokkosView(ll, &lv)); 1115 PetscCall(PetscLogGpuFlops(nz)); 1116 } 1117 if (rr) { 1118 PetscCall(VecGetLocalSize(rr, &n)); 1119 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1120 PetscCall(VecGetKokkosView(rr, &rv)); 1121 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1122 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1123 PetscCall(VecRestoreKokkosView(rr, &lv)); 1124 PetscCall(PetscLogGpuFlops(nz)); 1125 } 1126 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1127 PetscCall(PetscLogGpuTimeEnd()); 1128 PetscFunctionReturn(PETSC_SUCCESS); 1129 } 1130 1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1132 { 1133 Mat_SeqAIJKokkos *aijkok; 1134 1135 PetscFunctionBegin; 1136 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1137 if (aijkok) { /* Only zero the device if data is already there */ 1138 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1139 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1140 } else { /* Might be preallocated but not assembled */ 1141 PetscCall(MatZeroEntries_SeqAIJ(A)); 1142 } 1143 PetscFunctionReturn(PETSC_SUCCESS); 1144 } 1145 1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1147 { 1148 Mat_SeqAIJKokkos *aijkok; 1149 PetscInt n; 1150 PetscScalarKokkosView xv; 1151 1152 PetscFunctionBegin; 1153 PetscCall(VecGetLocalSize(x, &n)); 1154 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1155 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1156 1157 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1158 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1159 1160 const auto &Aa = aijkok->a_dual.view_device(); 1161 const auto &Ai = aijkok->i_dual.view_device(); 1162 const auto &Adiag = aijkok->diag_dual.view_device(); 1163 1164 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1165 Kokkos::parallel_for( 1166 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1167 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1168 else xv(i) = 0; 1169 }); 1170 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1171 PetscFunctionReturn(PETSC_SUCCESS); 1172 } 1173 1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1176 { 1177 Mat_SeqAIJKokkos *aijkok; 1178 1179 PetscFunctionBegin; 1180 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1181 PetscAssertPointer(kv, 2); 1182 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1183 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1184 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1185 *kv = aijkok->a_dual.view_device(); 1186 PetscFunctionReturn(PETSC_SUCCESS); 1187 } 1188 1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1190 { 1191 PetscFunctionBegin; 1192 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1193 PetscAssertPointer(kv, 2); 1194 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1199 { 1200 Mat_SeqAIJKokkos *aijkok; 1201 1202 PetscFunctionBegin; 1203 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1204 PetscAssertPointer(kv, 2); 1205 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1206 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1207 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1208 *kv = aijkok->a_dual.view_device(); 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1213 { 1214 PetscFunctionBegin; 1215 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1216 PetscAssertPointer(kv, 2); 1217 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1218 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1223 { 1224 Mat_SeqAIJKokkos *aijkok; 1225 1226 PetscFunctionBegin; 1227 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1228 PetscAssertPointer(kv, 2); 1229 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1230 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1231 *kv = aijkok->a_dual.view_device(); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1236 { 1237 PetscFunctionBegin; 1238 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1239 PetscAssertPointer(kv, 2); 1240 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1241 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1246 { 1247 Mat_SeqAIJKokkos *akok; 1248 1249 PetscFunctionBegin; 1250 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1251 PetscCall(MatCreate(comm, A)); 1252 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1253 PetscFunctionReturn(PETSC_SUCCESS); 1254 } 1255 1256 /* Computes Y += alpha X */ 1257 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1258 { 1259 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1260 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1261 ConstMatScalarKokkosView Xa; 1262 MatScalarKokkosView Ya; 1263 auto exec = PetscGetKokkosExecutionSpace(); 1264 1265 PetscFunctionBegin; 1266 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1267 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1268 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1269 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1270 PetscCall(PetscLogGpuTimeBegin()); 1271 1272 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1273 PetscBool e; 1274 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1275 if (e) { 1276 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1277 if (e) pattern = SAME_NONZERO_PATTERN; 1278 } 1279 } 1280 1281 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1282 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1283 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1284 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1285 */ 1286 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1287 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1288 Xa = xkok->a_dual.view_device(); 1289 Ya = ykok->a_dual.view_device(); 1290 1291 if (pattern == SAME_NONZERO_PATTERN) { 1292 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1293 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1294 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1295 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1296 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1297 1298 Kokkos::parallel_for( 1299 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1300 PetscInt i = t.league_rank(); // row i 1301 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1302 // Only one thread works in a team 1303 PetscInt p, q = Yi(i); 1304 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1305 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1306 if (Xj(p) == Yj(q)) { // Found it 1307 Ya(q) += alpha * Xa(p); 1308 q++; 1309 } else { 1310 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1311 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1312 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1313 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1314 #else 1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1316 #endif 1317 } 1318 } 1319 }); 1320 }); 1321 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1322 } else { // different nonzero patterns 1323 Mat Z; 1324 KokkosCsrMatrix zcsr; 1325 KernelHandle kh; 1326 kh.create_spadd_handle(true); // X, Y are sorted 1327 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1328 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1329 zkok = new Mat_SeqAIJKokkos(zcsr); 1330 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1331 PetscCall(MatHeaderReplace(Y, &Z)); 1332 kh.destroy_spadd_handle(); 1333 } 1334 PetscCall(PetscLogGpuTimeEnd()); 1335 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 struct MatCOOStruct_SeqAIJKokkos { 1340 PetscCount n; 1341 PetscCount Atot; 1342 PetscInt nz; 1343 PetscCountKokkosView jmap; 1344 PetscCountKokkosView perm; 1345 1346 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1347 { 1348 nz = coo_h->nz; 1349 n = coo_h->n; 1350 Atot = coo_h->Atot; 1351 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1352 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1353 } 1354 }; 1355 1356 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1357 { 1358 PetscFunctionBegin; 1359 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1360 PetscFunctionReturn(PETSC_SUCCESS); 1361 } 1362 1363 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1364 { 1365 Mat_SeqAIJKokkos *akok; 1366 Mat_SeqAIJ *aseq; 1367 PetscContainer container_h; 1368 MatCOOStruct_SeqAIJ *coo_h; 1369 MatCOOStruct_SeqAIJKokkos *coo_d; 1370 1371 PetscFunctionBegin; 1372 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1373 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1374 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1375 delete akok; 1376 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1377 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1378 1379 // Copy the COO struct to device 1380 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1381 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1382 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1383 1384 // Put the COO struct in a container and then attach that to the matrix 1385 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1390 { 1391 MatScalarKokkosView Aa; 1392 ConstMatScalarKokkosView kv; 1393 PetscMemType memtype; 1394 PetscContainer container; 1395 MatCOOStruct_SeqAIJKokkos *coo; 1396 1397 PetscFunctionBegin; 1398 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1399 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1400 1401 const auto &n = coo->n; 1402 const auto &Annz = coo->nz; 1403 const auto &jmap = coo->jmap; 1404 const auto &perm = coo->perm; 1405 1406 PetscCall(PetscGetMemType(v, &memtype)); 1407 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1408 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1409 } else { 1410 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1411 } 1412 1413 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1414 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1415 1416 PetscCall(PetscLogGpuTimeBegin()); 1417 Kokkos::parallel_for( 1418 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1419 PetscScalar sum = 0.0; 1420 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1421 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1422 }); 1423 PetscCall(PetscLogGpuTimeEnd()); 1424 1425 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1426 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1427 PetscFunctionReturn(PETSC_SUCCESS); 1428 } 1429 1430 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1431 { 1432 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1433 1434 PetscFunctionBegin; 1435 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1436 A->boundtocpu = PETSC_FALSE; 1437 1438 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1439 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1440 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1441 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1442 A->ops->scale = MatScale_SeqAIJKokkos; 1443 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1444 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1445 A->ops->mult = MatMult_SeqAIJKokkos; 1446 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1447 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1448 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1449 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1450 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1451 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1452 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1453 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1454 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1455 A->ops->shift = MatShift_SeqAIJKokkos; 1456 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1457 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1458 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1459 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1460 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1461 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1462 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1463 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1464 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1465 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1466 1467 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1468 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1469 #if defined(PETSC_HAVE_HYPRE) 1470 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1471 #endif 1472 PetscFunctionReturn(PETSC_SUCCESS); 1473 } 1474 1475 /* 1476 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1477 1478 Input Parameters: 1479 + A - the MATSEQAIJKOKKOS matrix 1480 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1481 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1482 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1483 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1484 1485 Output Parameter: 1486 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1487 */ 1488 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1489 { 1490 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1491 PetscInt nblocks = bs.extent(0) - 1; 1492 1493 PetscFunctionBegin; 1494 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1495 1496 // Pull out the diagonal blocks of the matrix and then invert the blocks 1497 auto Aa = akok->a_dual.view_device(); 1498 auto Ai = akok->i_dual.view_device(); 1499 auto Aj = akok->j_dual.view_device(); 1500 auto Adiag = akok->diag_dual.view_device(); 1501 // TODO: how to tune the team size? 1502 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1503 auto ts = Kokkos::AUTO(); 1504 #else 1505 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1506 #endif 1507 PetscCallCXX(Kokkos::parallel_for( 1508 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1509 const PetscInt bid = teamMember.league_rank(); // block id 1510 const PetscInt rstart = bs(bid); // this block starts from this row 1511 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1512 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1513 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1514 1515 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1516 PetscInt i = rstart + r; // i-th row in A 1517 1518 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1519 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1520 1521 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1522 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1523 B(r, c) = 0.0; 1524 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1525 B(r, c) = Aa(first + c); 1526 } else { // this entry does not show up in the CSR 1527 B(r, c) = 0.0; 1528 } 1529 } 1530 } else { // rare case that the diagonal does not exist 1531 const PetscInt begin = Ai(i); 1532 const PetscInt end = Ai(i + 1); 1533 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1534 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1535 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1536 else if (Aj(j) >= rstart + m) break; 1537 } 1538 } 1539 }); 1540 1541 // LU-decompose B (w/o pivoting) and then invert B 1542 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1543 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1544 })); 1545 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1546 PetscFunctionReturn(PETSC_SUCCESS); 1547 } 1548 1549 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1550 { 1551 Mat_SeqAIJ *aseq; 1552 PetscInt i, m, n; 1553 auto exec = PetscGetKokkosExecutionSpace(); 1554 1555 PetscFunctionBegin; 1556 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1557 1558 m = akok->nrows(); 1559 n = akok->ncols(); 1560 PetscCall(MatSetSizes(A, m, n, m, n)); 1561 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1562 1563 /* Set up data structures of A as a MATSEQAIJ */ 1564 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1565 aseq = (Mat_SeqAIJ *)A->data; 1566 1567 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1568 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1569 1570 aseq->i = akok->i_host_data(); 1571 aseq->j = akok->j_host_data(); 1572 aseq->a = akok->a_host_data(); 1573 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1574 aseq->free_a = PETSC_FALSE; 1575 aseq->free_ij = PETSC_FALSE; 1576 aseq->nz = akok->nnz(); 1577 aseq->maxnz = aseq->nz; 1578 1579 PetscCall(PetscMalloc1(m, &aseq->imax)); 1580 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1581 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1582 1583 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1584 akok->nonzerostate = A->nonzerostate; 1585 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1586 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1587 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1588 PetscFunctionReturn(PETSC_SUCCESS); 1589 } 1590 1591 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1592 { 1593 PetscFunctionBegin; 1594 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1595 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1596 PetscFunctionReturn(PETSC_SUCCESS); 1597 } 1598 1599 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1600 { 1601 Mat_SeqAIJKokkos *akok; 1602 1603 PetscFunctionBegin; 1604 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1605 PetscCall(MatCreate(comm, A)); 1606 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1607 PetscFunctionReturn(PETSC_SUCCESS); 1608 } 1609 1610 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1611 1612 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1613 */ 1614 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1615 { 1616 PetscFunctionBegin; 1617 PetscCall(MatCreate(comm, A)); 1618 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 /*@C 1623 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1624 (the default parallel PETSc format). This matrix will ultimately be handled by 1625 Kokkos for calculations. 1626 1627 Collective 1628 1629 Input Parameters: 1630 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1631 . m - number of rows 1632 . n - number of columns 1633 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1634 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1635 1636 Output Parameter: 1637 . A - the matrix 1638 1639 Level: intermediate 1640 1641 Notes: 1642 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1643 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1644 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1645 1646 The AIJ format, also called 1647 compressed row storage, is fully compatible with standard Fortran 1648 storage. That is, the stored row and column indices can begin at 1649 either one (as in Fortran) or zero. 1650 1651 Specify the preallocated storage with either `nz` or `nnz` (not both). 1652 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1653 allocation. 1654 1655 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1656 @*/ 1657 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1658 { 1659 PetscFunctionBegin; 1660 PetscCall(PetscKokkosInitializeCheck()); 1661 PetscCall(MatCreate(comm, A)); 1662 PetscCall(MatSetSizes(*A, m, n, m, n)); 1663 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1664 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1665 PetscFunctionReturn(PETSC_SUCCESS); 1666 } 1667 1668 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1669 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1670 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1671 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1672 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1673 { 1674 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1675 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1676 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1677 1678 PetscFunctionBegin; 1679 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1680 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1681 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1682 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1683 } 1684 PetscFunctionReturn(PETSC_SUCCESS); 1685 } 1686 1687 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1688 { 1689 const PetscInt n = A->rmap->n; 1690 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1691 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1692 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1693 1694 PetscFunctionBegin; 1695 if (!factors->transpose_updated) { 1696 if (has_upper) { 1697 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1698 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1699 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1700 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1701 } 1702 1703 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1704 if (!factors->U) { 1705 Mat_SeqAIJ *seq; 1706 1707 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1708 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1709 1710 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1711 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1712 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1713 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1714 } else { 1715 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1716 } 1717 // Copy Ut from host to device 1718 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1719 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1720 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1721 } else { // If U was computed on device, we also compute the transpose there 1722 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1723 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1724 factors->jU_d, factors->aU_d, 1725 factors->iUt_d, factors->jUt_d, 1726 factors->aUt_d)); 1727 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1728 } 1729 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1730 } 1731 1732 // do the same for L with LU 1733 if (has_lower) { 1734 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1735 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1736 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1737 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1738 } 1739 1740 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1741 if (!factors->L) { 1742 Mat_SeqAIJ *seq; 1743 1744 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1745 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1746 1747 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1748 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1749 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1750 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1751 } else { 1752 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1753 } 1754 // Copy Lt from host to device 1755 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1756 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1757 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1758 } else { // If L was computed on device, we also compute the transpose there 1759 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1760 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1761 factors->jL_d, factors->aL_d, 1762 factors->iLt_d, factors->jLt_d, 1763 factors->aLt_d)); 1764 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1765 } 1766 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1767 } 1768 1769 factors->transpose_updated = PETSC_TRUE; 1770 } 1771 PetscFunctionReturn(PETSC_SUCCESS); 1772 } 1773 1774 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1775 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1776 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1777 { 1778 auto exec = PetscGetKokkosExecutionSpace(); 1779 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1780 PetscInt m = A->rmap->n; 1781 PetscScalarKokkosView D = factors->D_d; 1782 PetscScalarKokkosView X, Y, B; // alias 1783 ConstPetscScalarKokkosView b; 1784 PetscScalarKokkosView x; 1785 PetscIntKokkosView &rowperm = factors->rowperm; 1786 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1787 1788 PetscFunctionBegin; 1789 PetscCall(PetscLogGpuTimeBegin()); 1790 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1791 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1792 PetscCall(VecGetKokkosView(bb, &b)); 1793 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1794 1795 // Solve U^T Y = B 1796 if (identity) { // Reorder b with the row permutation 1797 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1798 Y = factors->workVector; 1799 } else { 1800 B = factors->workVector; 1801 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1802 Y = x; 1803 } 1804 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1805 1806 // Solve diag(D) Y' = Y. 1807 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1808 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1809 1810 // Solve UX = Y 1811 if (identity) { 1812 X = x; 1813 } else { 1814 X = factors->workVector; // B is not needed anymore 1815 } 1816 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1817 1818 // Reorder X with the inverse column (row) permutation 1819 if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1820 1821 PetscCall(VecRestoreKokkosView(bb, &b)); 1822 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1823 PetscCall(PetscLogGpuTimeEnd()); 1824 PetscFunctionReturn(PETSC_SUCCESS); 1825 } 1826 1827 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1828 // R and C are represented by rowperm and colperm in factors. 1829 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1830 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1831 { 1832 auto exec = PetscGetKokkosExecutionSpace(); 1833 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1834 PetscInt m = A->rmap->n; 1835 PetscScalarKokkosView X, Y, B; // alias 1836 ConstPetscScalarKokkosView b; 1837 PetscScalarKokkosView x; 1838 PetscIntKokkosView &rowperm = factors->rowperm; 1839 PetscIntKokkosView &colperm = factors->colperm; 1840 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1841 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1842 1843 PetscFunctionBegin; 1844 PetscCall(PetscLogGpuTimeBegin()); 1845 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1846 PetscCall(VecGetKokkosView(bb, &b)); 1847 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1848 1849 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1850 if (row_identity) { 1851 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1852 Y = factors->workVector; 1853 } else { 1854 B = factors->workVector; 1855 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1856 Y = x; 1857 } 1858 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1859 1860 // Solve U C^- x = Y 1861 if (col_identity) { 1862 X = x; 1863 } else { 1864 X = factors->workVector; 1865 } 1866 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1867 1868 // x = C X; Reorder X with the inverse col permutation 1869 if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1870 1871 PetscCall(VecRestoreKokkosView(bb, &b)); 1872 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1873 PetscCall(PetscLogGpuTimeEnd()); 1874 PetscFunctionReturn(PETSC_SUCCESS); 1875 } 1876 1877 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1878 // R and C are represented by rowperm and colperm in factors. 1879 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1880 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1881 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1882 { 1883 auto exec = PetscGetKokkosExecutionSpace(); 1884 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1885 PetscInt m = A->rmap->n; 1886 PetscScalarKokkosView X, Y, B; // alias 1887 ConstPetscScalarKokkosView b; 1888 PetscScalarKokkosView x; 1889 PetscIntKokkosView &rowperm = factors->rowperm; 1890 PetscIntKokkosView &colperm = factors->colperm; 1891 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1892 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1893 1894 PetscFunctionBegin; 1895 PetscCall(PetscLogGpuTimeBegin()); 1896 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1897 PetscCall(VecGetKokkosView(bb, &b)); 1898 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1899 1900 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1901 if (col_identity) { // Reorder b with the col permutation 1902 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1903 Y = factors->workVector; 1904 } else { 1905 B = factors->workVector; 1906 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1907 Y = x; 1908 } 1909 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1910 1911 // Solve L^T X = Y 1912 if (row_identity) { 1913 X = x; 1914 } else { 1915 X = factors->workVector; 1916 } 1917 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1918 1919 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1920 if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1921 1922 PetscCall(VecRestoreKokkosView(bb, &b)); 1923 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1924 PetscCall(PetscLogGpuTimeEnd()); 1925 PetscFunctionReturn(PETSC_SUCCESS); 1926 } 1927 1928 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1929 { 1930 PetscFunctionBegin; 1931 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1932 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1933 1934 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1935 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1936 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1937 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1938 const MatScalar *Ba = b->a; 1939 PetscInt m = B->rmap->n, n = B->cmap->n; 1940 1941 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1942 // Allocate memory and copy the structure 1943 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1944 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1945 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1946 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1947 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1948 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1949 1950 PetscInt *Li = factors->iL_h.data(); 1951 PetscInt *Lj = factors->jL_h.data(); 1952 PetscInt *Ui = factors->iU_h.data(); 1953 PetscInt *Uj = factors->jU_h.data(); 1954 1955 Li[0] = Ui[0] = 0; 1956 for (PetscInt i = 0; i < m; i++) { 1957 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1958 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1959 1960 PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1961 Lj[Li[i] + llen] = i; // diagonal entry of L 1962 1963 Uj[Ui[i]] = i; // diagonal entry of U 1964 PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1965 1966 Li[i + 1] = Li[i] + llen + 1; 1967 Ui[i + 1] = Ui[i] + ulen; 1968 } 1969 1970 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1971 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1972 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1973 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1974 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1975 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1976 1977 // Copy row/col permutation to device 1978 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1979 PetscBool row_identity; 1980 PetscCall(ISIdentity(rowperm, &row_identity)); 1981 if (!row_identity) { 1982 const PetscInt *ip; 1983 1984 PetscCall(ISGetIndices(rowperm, &ip)); 1985 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1986 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1987 PetscCall(ISRestoreIndices(rowperm, &ip)); 1988 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1989 } 1990 1991 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1992 PetscBool col_identity; 1993 PetscCall(ISIdentity(colperm, &col_identity)); 1994 if (!col_identity) { 1995 const PetscInt *ip; 1996 1997 PetscCall(ISGetIndices(colperm, &ip)); 1998 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 1999 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2000 PetscCall(ISRestoreIndices(colperm, &ip)); 2001 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2002 } 2003 2004 /* Create sptrsv handles for L, U and their transpose */ 2005 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2006 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2007 #else 2008 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2009 #endif 2010 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2011 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2012 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2013 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2014 } 2015 2016 // Copy the value 2017 for (PetscInt i = 0; i < m; i++) { 2018 PetscInt llen = Bi[i + 1] - Bi[i]; 2019 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2020 const PetscInt *Li = factors->iL_h.data(); 2021 const PetscInt *Ui = factors->iU_h.data(); 2022 2023 PetscScalar *La = factors->aL_h.data(); 2024 PetscScalar *Ua = factors->aU_h.data(); 2025 2026 PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2027 La[Li[i] + llen] = 1.0; // diagonal entry 2028 2029 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2030 PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2031 } 2032 2033 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2034 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2035 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2036 factors->transpose_updated = PETSC_FALSE; 2037 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2038 2039 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2040 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2041 } 2042 2043 B->ops->matsolve = NULL; 2044 B->ops->matsolvetranspose = NULL; 2045 PetscFunctionReturn(PETSC_SUCCESS); 2046 } 2047 2048 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2049 { 2050 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2051 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2052 PetscInt fill_lev = info->levels; 2053 2054 PetscFunctionBegin; 2055 PetscCall(PetscLogGpuTimeBegin()); 2056 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2057 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2058 2059 auto a_d = aijkok->a_dual.view_device(); 2060 auto i_d = aijkok->i_dual.view_device(); 2061 auto j_d = aijkok->j_dual.view_device(); 2062 2063 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2064 2065 B->assembled = PETSC_TRUE; 2066 B->preallocated = PETSC_TRUE; 2067 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2068 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2069 B->ops->matsolve = NULL; 2070 B->ops->matsolvetranspose = NULL; 2071 2072 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2073 factors->transpose_updated = PETSC_FALSE; 2074 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2075 /* TODO: log flops, but how to know that? */ 2076 PetscCall(PetscLogGpuTimeEnd()); 2077 PetscFunctionReturn(PETSC_SUCCESS); 2078 } 2079 2080 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2081 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2082 { 2083 Mat_SeqAIJKokkos *aijkok; 2084 Mat_SeqAIJ *b; 2085 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2086 PetscInt fill_lev = info->levels; 2087 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2088 PetscInt n = A->rmap->n; 2089 2090 PetscFunctionBegin; 2091 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2092 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2093 2094 /* Create a spiluk handle and then do symbolic factorization */ 2095 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2096 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2097 2098 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2099 2100 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2101 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2102 Kokkos::realloc(factors->iU_d, n + 1); 2103 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2104 2105 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2106 auto i_d = aijkok->i_dual.view_device(); 2107 auto j_d = aijkok->j_dual.view_device(); 2108 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2109 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2110 2111 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2112 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2113 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2114 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2115 2116 /* TODO: add options to select sptrsv algorithms */ 2117 /* Create sptrsv handles for L, U and their transpose */ 2118 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2119 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2120 #else 2121 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2122 #endif 2123 2124 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2125 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2126 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2127 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2128 2129 /* Fill fields of the factor matrix B */ 2130 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2131 b = (Mat_SeqAIJ *)B->data; 2132 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2133 B->info.fill_ratio_given = info->fill; 2134 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2135 2136 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2137 PetscFunctionReturn(PETSC_SUCCESS); 2138 } 2139 2140 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2141 { 2142 PetscFunctionBegin; 2143 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2144 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2145 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2146 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2147 PetscFunctionReturn(PETSC_SUCCESS); 2148 } 2149 2150 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2151 { 2152 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2153 2154 PetscFunctionBegin; 2155 if (!info->factoronhost) { 2156 PetscCall(ISIdentity(isrow, &row_identity)); 2157 PetscCall(ISIdentity(iscol, &col_identity)); 2158 } 2159 2160 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2161 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2162 2163 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2164 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2165 } else { 2166 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2167 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2168 } 2169 PetscFunctionReturn(PETSC_SUCCESS); 2170 } 2171 2172 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2173 { 2174 PetscFunctionBegin; 2175 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2176 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2177 2178 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2179 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2180 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2181 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2182 const MatScalar *Ba = b->a; 2183 PetscInt m = B->rmap->n; 2184 2185 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2186 // Allocate memory and copy the structure 2187 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2188 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2189 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2190 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2191 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2192 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2193 2194 // Build jU_h from the skewed Aj 2195 PetscInt *Uj = factors->jU_h.data(); 2196 for (PetscInt i = 0; i < m; i++) { 2197 PetscInt ulen = Bi[i + 1] - Bi[i]; 2198 Uj[Bi[i]] = i; // diagonal entry 2199 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2200 } 2201 2202 // Copy iU, jU to device 2203 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2204 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2205 2206 // Copy row/col permutation to device 2207 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2208 PetscBool row_identity; 2209 PetscCall(ISIdentity(rowperm, &row_identity)); 2210 if (!row_identity) { 2211 const PetscInt *ip; 2212 2213 PetscCall(ISGetIndices(rowperm, &ip)); 2214 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2215 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2216 PetscCall(ISRestoreIndices(rowperm, &ip)); 2217 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2218 } 2219 2220 // Create sptrsv handles for U and U^T 2221 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2222 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2223 #else 2224 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2225 #endif 2226 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2227 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2228 } 2229 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2230 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2231 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2232 2233 // Copy the value 2234 PetscScalar *Ua = factors->aU_h.data(); 2235 PetscScalar *D = factors->D_h.data(); 2236 for (PetscInt i = 0; i < m; i++) { 2237 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2238 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2239 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2240 } 2241 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2242 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2243 2244 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2245 factors->transpose_updated = PETSC_FALSE; 2246 } 2247 2248 B->ops->matsolve = NULL; 2249 B->ops->matsolvetranspose = NULL; 2250 PetscFunctionReturn(PETSC_SUCCESS); 2251 } 2252 2253 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2254 { 2255 PetscFunctionBegin; 2256 if (info->solveonhost) { 2257 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2258 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2259 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2260 } 2261 2262 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2263 2264 if (!info->solveonhost) { 2265 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2266 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2267 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2268 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2269 } 2270 PetscFunctionReturn(PETSC_SUCCESS); 2271 } 2272 2273 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2274 { 2275 PetscFunctionBegin; 2276 if (info->solveonhost) { 2277 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2278 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2279 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2280 } 2281 2282 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2283 2284 if (!info->solveonhost) { 2285 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2286 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2287 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2288 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2289 } 2290 PetscFunctionReturn(PETSC_SUCCESS); 2291 } 2292 2293 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2294 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2295 { 2296 PetscFunctionBegin; 2297 *type = MATSOLVERKOKKOS; 2298 PetscFunctionReturn(PETSC_SUCCESS); 2299 } 2300 2301 /*MC 2302 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2303 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2304 2305 Level: beginner 2306 2307 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS` 2308 M*/ 2309 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2310 { 2311 PetscInt n = A->rmap->n; 2312 MPI_Comm comm; 2313 2314 PetscFunctionBegin; 2315 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2316 PetscCall(MatCreate(comm, B)); 2317 PetscCall(MatSetSizes(*B, n, n, n, n)); 2318 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2319 (*B)->factortype = ftype; 2320 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2321 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2322 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2323 2324 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2325 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2326 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2327 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2328 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2329 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2330 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2331 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2332 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2333 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2334 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2335 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2336 2337 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2338 (*B)->canuseordering = PETSC_TRUE; 2339 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2340 PetscFunctionReturn(PETSC_SUCCESS); 2341 } 2342 2343 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2344 { 2345 PetscFunctionBegin; 2346 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2347 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2350 PetscFunctionReturn(PETSC_SUCCESS); 2351 } 2352 2353 /* Utility to print out a KokkosCsrMatrix for debugging */ 2354 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2355 { 2356 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2357 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2358 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2359 const PetscInt *i = iv.data(); 2360 const PetscInt *j = jv.data(); 2361 const PetscScalar *a = av.data(); 2362 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2363 2364 PetscFunctionBegin; 2365 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2366 for (PetscInt k = 0; k < m; k++) { 2367 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2368 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2369 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2370 } 2371 PetscFunctionReturn(PETSC_SUCCESS); 2372 } 2373