1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petsc/private/kokkosimpl.hpp> 8 #include <petscsys.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 #define DISABLE_CUSPARSE_DEPRECATED 20 #include <KokkosSparse_spmv.hpp> 21 22 #include <KokkosSparse_spiluk.hpp> 23 #include <KokkosSparse_sptrsv.hpp> 24 #include <KokkosSparse_spgemm.hpp> 25 #include <KokkosSparse_spadd.hpp> 26 #include <KokkosBatched_LU_Decl.hpp> 27 #include <KokkosBatched_InverseLU_Decl.hpp> 28 29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 30 31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32 #include <KokkosSparse_Utils.hpp> 33 using KokkosSparse::sort_crs_matrix; 34 using KokkosSparse::Impl::transpose_matrix; 35 #else 36 #include <KokkosKernels_Sorting.hpp> 37 using KokkosKernels::sort_crs_matrix; 38 using KokkosKernels::Impl::transpose_matrix; 39 #endif 40 41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 42 using KokkosSparse::spiluk_symbolic; 43 using KokkosSparse::spiluk_numeric; 44 using KokkosSparse::sptrsv_symbolic; 45 using KokkosSparse::sptrsv_solve; 46 using KokkosSparse::Experimental::SPTRSVAlgorithm; 47 using KokkosSparse::Experimental::SPILUKAlgorithm; 48 #else 49 using KokkosSparse::Experimental::spiluk_symbolic; 50 using KokkosSparse::Experimental::spiluk_numeric; 51 using KokkosSparse::Experimental::sptrsv_symbolic; 52 using KokkosSparse::Experimental::sptrsv_solve; 53 using KokkosSparse::Experimental::SPTRSVAlgorithm; 54 using KokkosSparse::Experimental::SPILUKAlgorithm; 55 #endif 56 57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 58 59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 60 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 61 In the latter case, it is important to set a_dual's sync state correctly. 62 */ 63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 64 { 65 Mat_SeqAIJ *aijseq; 66 Mat_SeqAIJKokkos *aijkok; 67 68 PetscFunctionBegin; 69 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 70 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 71 72 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 73 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 74 75 /* If aijkok does not exist, we just copy i, j to device. 76 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 77 In both cases, we build a new aijkok structure. 78 */ 79 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 80 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 81 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 82 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 84 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 85 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 86 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 87 aijseq->free_a = PETSC_TRUE; 88 aijseq->free_ij = PETSC_TRUE; 89 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 90 we have the CsrMatrix on device first and then copy to host, followed by 91 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 92 One could improve it by not using NULL garray. 93 */ 94 } 95 delete aijkok; 96 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 97 A->spptr = aijkok; 98 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 99 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 100 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 101 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 102 } 103 PetscFunctionReturn(PETSC_SUCCESS); 104 } 105 106 /* Sync CSR data to device if not yet */ 107 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110 111 PetscFunctionBegin; 112 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 113 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 114 if (aijkok->a_dual.need_sync_device()) { 115 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 116 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 117 aijkok->hermitian_updated = PETSC_FALSE; 118 } 119 PetscFunctionReturn(PETSC_SUCCESS); 120 } 121 122 /* Mark the CSR data on device as modified */ 123 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 124 { 125 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 126 127 PetscFunctionBegin; 128 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 129 aijkok->a_dual.clear_sync_state(); 130 aijkok->a_dual.modify_device(); 131 aijkok->transpose_updated = PETSC_FALSE; 132 aijkok->hermitian_updated = PETSC_FALSE; 133 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 138 { 139 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 140 auto exec = PetscGetKokkosExecutionSpace(); 141 142 PetscFunctionBegin; 143 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 144 /* We do not expect one needs factors on host */ 145 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 146 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 147 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 148 PetscFunctionReturn(PETSC_SUCCESS); 149 } 150 151 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 152 { 153 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 154 155 PetscFunctionBegin; 156 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 157 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 158 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 159 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 160 */ 161 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 162 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Happens when calling MatSetValues on a newly created matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 186 *array = aijkok->a_dual.view_host().data(); 187 } else { 188 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 189 } 190 PetscFunctionReturn(PETSC_SUCCESS); 191 } 192 193 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 194 { 195 PetscFunctionBegin; 196 *array = NULL; 197 PetscFunctionReturn(PETSC_SUCCESS); 198 } 199 200 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 204 PetscFunctionBegin; 205 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 206 *array = aijkok->a_dual.view_host().data(); 207 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 208 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 209 } 210 PetscFunctionReturn(PETSC_SUCCESS); 211 } 212 213 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 219 aijkok->a_dual.clear_sync_state(); 220 aijkok->a_dual.modify_host(); 221 } 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 226 { 227 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 228 229 PetscFunctionBegin; 230 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 231 232 if (i) *i = aijkok->i_device_data(); 233 if (j) *j = aijkok->j_device_data(); 234 if (a) { 235 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 236 *a = aijkok->a_device_data(); 237 } 238 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 239 PetscFunctionReturn(PETSC_SUCCESS); 240 } 241 242 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 243 { 244 PetscFunctionBegin; 245 *mtype = PETSC_MEMTYPE_KOKKOS; 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 /* 250 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 251 252 Input Parameter: 253 . A - the MATSEQAIJKOKKOS matrix 254 255 Output Parameters: 256 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 257 - T_d - the transpose on device, whose value array is allocated but not initialized 258 */ 259 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 260 { 261 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 262 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 263 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 264 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 265 MatRowMapType *Ti = Ti_h.data(); 266 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 267 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 268 PetscInt *Tj = Tj_h.data(); 269 PetscInt *perm = perm_h.data(); 270 PetscInt *offset; 271 272 PetscFunctionBegin; 273 // Populate Ti 274 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 275 Ti++; 276 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 277 Ti--; 278 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 279 280 // Populate Tj and the permutation array 281 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 282 for (PetscInt i = 0; i < m; i++) { 283 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 284 PetscInt r = Aj[j]; // row r of T 285 PetscInt disp = Ti[r] + offset[r]; 286 287 Tj[disp] = i; // col i of T 288 perm[disp] = j; 289 offset[r]++; 290 } 291 } 292 PetscCall(PetscFree(offset)); 293 294 // Sort each row of T, along with the permutation array 295 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 296 297 // Output perm and T on device 298 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 299 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 300 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 301 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 302 PetscFunctionReturn(PETSC_SUCCESS); 303 } 304 305 // Generate the transpose on device and cache it internally 306 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 307 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 308 { 309 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 310 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 311 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 312 KokkosCsrMatrix &T = akok->csrmatT; 313 314 PetscFunctionBegin; 315 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 316 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 317 318 const auto &Aa = akok->a_dual.view_device(); 319 320 if (A->symmetric == PETSC_BOOL3_TRUE) { 321 *csrmatT = akok->csrmat; 322 } else { 323 // See if we already have a cached transpose and its value is up to date 324 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 325 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 326 const auto &perm = akok->transpose_perm; // get the permutation array 327 auto &Ta = T.values; 328 329 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 330 } 331 } else { // Generate T of size n x m for the first time 332 MatRowMapKokkosView perm; 333 334 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 335 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 336 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 337 } 338 akok->transpose_updated = PETSC_TRUE; 339 *csrmatT = akok->csrmatT; 340 } 341 PetscFunctionReturn(PETSC_SUCCESS); 342 } 343 344 // Generate the Hermitian on device and cache it internally 345 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 346 { 347 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 348 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 349 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 350 KokkosCsrMatrix &T = akok->csrmatH; 351 352 PetscFunctionBegin; 353 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 354 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 355 356 const auto &Aa = akok->a_dual.view_device(); 357 358 if (A->hermitian == PETSC_BOOL3_TRUE) { 359 *csrmatH = akok->csrmat; 360 } else { 361 // See if we already have a cached hermitian and its value is up to date 362 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 363 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 364 const auto &perm = akok->transpose_perm; // get the permutation array 365 auto &Ta = T.values; 366 367 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 368 } 369 } else { // Generate T of size n x m for the first time 370 MatRowMapKokkosView perm; 371 372 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 373 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 374 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 375 } 376 akok->hermitian_updated = PETSC_TRUE; 377 *csrmatH = akok->csrmatH; 378 } 379 PetscFunctionReturn(PETSC_SUCCESS); 380 } 381 382 /* y = A x */ 383 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 384 { 385 Mat_SeqAIJKokkos *aijkok; 386 ConstPetscScalarKokkosView xv; 387 PetscScalarKokkosView yv; 388 389 PetscFunctionBegin; 390 PetscCall(PetscLogGpuTimeBegin()); 391 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 392 PetscCall(VecGetKokkosView(xx, &xv)); 393 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 394 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 395 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 396 PetscCall(VecRestoreKokkosView(xx, &xv)); 397 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 398 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 399 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 400 PetscCall(PetscLogGpuTimeEnd()); 401 PetscFunctionReturn(PETSC_SUCCESS); 402 } 403 404 /* y = A^T x */ 405 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 406 { 407 Mat_SeqAIJKokkos *aijkok; 408 const char *mode; 409 ConstPetscScalarKokkosView xv; 410 PetscScalarKokkosView yv; 411 KokkosCsrMatrix csrmat; 412 413 PetscFunctionBegin; 414 PetscCall(PetscLogGpuTimeBegin()); 415 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 416 PetscCall(VecGetKokkosView(xx, &xv)); 417 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 418 if (A->form_explicit_transpose) { 419 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 420 mode = "N"; 421 } else { 422 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 423 csrmat = aijkok->csrmat; 424 mode = "T"; 425 } 426 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 427 PetscCall(VecRestoreKokkosView(xx, &xv)); 428 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 429 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 430 PetscCall(PetscLogGpuTimeEnd()); 431 PetscFunctionReturn(PETSC_SUCCESS); 432 } 433 434 /* y = A^H x */ 435 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 436 { 437 Mat_SeqAIJKokkos *aijkok; 438 const char *mode; 439 ConstPetscScalarKokkosView xv; 440 PetscScalarKokkosView yv; 441 KokkosCsrMatrix csrmat; 442 443 PetscFunctionBegin; 444 PetscCall(PetscLogGpuTimeBegin()); 445 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 446 PetscCall(VecGetKokkosView(xx, &xv)); 447 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 448 if (A->form_explicit_transpose) { 449 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 450 mode = "N"; 451 } else { 452 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 453 csrmat = aijkok->csrmat; 454 mode = "C"; 455 } 456 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 457 PetscCall(VecRestoreKokkosView(xx, &xv)); 458 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 459 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 460 PetscCall(PetscLogGpuTimeEnd()); 461 PetscFunctionReturn(PETSC_SUCCESS); 462 } 463 464 /* z = A x + y */ 465 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 466 { 467 Mat_SeqAIJKokkos *aijkok; 468 ConstPetscScalarKokkosView xv; 469 PetscScalarKokkosView zv; 470 471 PetscFunctionBegin; 472 PetscCall(PetscLogGpuTimeBegin()); 473 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 474 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 475 PetscCall(VecGetKokkosView(xx, &xv)); 476 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 477 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 478 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 479 PetscCall(VecRestoreKokkosView(xx, &xv)); 480 PetscCall(VecRestoreKokkosView(zz, &zv)); 481 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 482 PetscCall(PetscLogGpuTimeEnd()); 483 PetscFunctionReturn(PETSC_SUCCESS); 484 } 485 486 /* z = A^T x + y */ 487 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 488 { 489 Mat_SeqAIJKokkos *aijkok; 490 const char *mode; 491 ConstPetscScalarKokkosView xv; 492 PetscScalarKokkosView zv; 493 KokkosCsrMatrix csrmat; 494 495 PetscFunctionBegin; 496 PetscCall(PetscLogGpuTimeBegin()); 497 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 498 if (zz != yy) PetscCall(VecCopy(yy, zz)); 499 PetscCall(VecGetKokkosView(xx, &xv)); 500 PetscCall(VecGetKokkosView(zz, &zv)); 501 if (A->form_explicit_transpose) { 502 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 503 mode = "N"; 504 } else { 505 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 506 csrmat = aijkok->csrmat; 507 mode = "T"; 508 } 509 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 510 PetscCall(VecRestoreKokkosView(xx, &xv)); 511 PetscCall(VecRestoreKokkosView(zz, &zv)); 512 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 513 PetscCall(PetscLogGpuTimeEnd()); 514 PetscFunctionReturn(PETSC_SUCCESS); 515 } 516 517 /* z = A^H x + y */ 518 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 519 { 520 Mat_SeqAIJKokkos *aijkok; 521 const char *mode; 522 ConstPetscScalarKokkosView xv; 523 PetscScalarKokkosView zv; 524 KokkosCsrMatrix csrmat; 525 526 PetscFunctionBegin; 527 PetscCall(PetscLogGpuTimeBegin()); 528 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 529 if (zz != yy) PetscCall(VecCopy(yy, zz)); 530 PetscCall(VecGetKokkosView(xx, &xv)); 531 PetscCall(VecGetKokkosView(zz, &zv)); 532 if (A->form_explicit_transpose) { 533 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 534 mode = "N"; 535 } else { 536 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 537 csrmat = aijkok->csrmat; 538 mode = "C"; 539 } 540 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 541 PetscCall(VecRestoreKokkosView(xx, &xv)); 542 PetscCall(VecRestoreKokkosView(zz, &zv)); 543 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 544 PetscCall(PetscLogGpuTimeEnd()); 545 PetscFunctionReturn(PETSC_SUCCESS); 546 } 547 548 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 549 { 550 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 551 552 PetscFunctionBegin; 553 switch (op) { 554 case MAT_FORM_EXPLICIT_TRANSPOSE: 555 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 556 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 557 A->form_explicit_transpose = flg; 558 break; 559 default: 560 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 561 break; 562 } 563 PetscFunctionReturn(PETSC_SUCCESS); 564 } 565 566 /* Depending on reuse, either build a new mat, or use the existing mat */ 567 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 568 { 569 Mat_SeqAIJ *aseq; 570 571 PetscFunctionBegin; 572 PetscCall(PetscKokkosInitializeCheck()); 573 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 574 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 575 PetscCall(MatSetType(*newmat, mtype)); 576 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 577 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 578 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 579 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 580 PetscCall(PetscFree(A->defaultvectype)); 581 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 582 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 583 PetscCall(MatSetOps_SeqAIJKokkos(A)); 584 aseq = static_cast<Mat_SeqAIJ *>(A->data); 585 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 586 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 587 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 588 } 589 } 590 PetscFunctionReturn(PETSC_SUCCESS); 591 } 592 593 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 594 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 595 */ 596 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 597 { 598 Mat_SeqAIJ *bseq; 599 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 600 Mat mat; 601 602 PetscFunctionBegin; 603 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 604 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 605 mat = *B; 606 if (A->assembled) { 607 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 608 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 609 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 610 /* Now copy values to B if needed */ 611 if (dupOption == MAT_COPY_VALUES) { 612 if (akok->a_dual.need_sync_device()) { 613 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 614 bkok->a_dual.modify_host(); 615 } else { /* If device has the latest data, we only copy data on device */ 616 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 617 bkok->a_dual.modify_device(); 618 } 619 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 620 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 621 bkok->a_dual.modify_host(); 622 } 623 mat->spptr = bkok; 624 } 625 626 PetscCall(PetscFree(mat->defaultvectype)); 627 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 628 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 629 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 630 PetscFunctionReturn(PETSC_SUCCESS); 631 } 632 633 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 634 { 635 Mat At; 636 KokkosCsrMatrix internT; 637 Mat_SeqAIJKokkos *atkok, *bkok; 638 639 PetscFunctionBegin; 640 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 641 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 642 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 643 /* Deep copy internT, as we want to isolate the internal transpose */ 644 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 645 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 646 if (reuse == MAT_INITIAL_MATRIX) *B = At; 647 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 648 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 649 if ((*B)->assembled) { 650 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 651 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 652 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 653 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 654 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 655 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 656 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 657 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 658 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 659 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 660 } 661 PetscFunctionReturn(PETSC_SUCCESS); 662 } 663 664 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 665 { 666 Mat_SeqAIJKokkos *aijkok; 667 668 PetscFunctionBegin; 669 if (A->factortype == MAT_FACTOR_NONE) { 670 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 671 delete aijkok; 672 } else { 673 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 674 } 675 A->spptr = NULL; 676 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 677 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 678 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 679 #if defined(PETSC_HAVE_HYPRE) 680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 681 #endif 682 PetscCall(MatDestroy_SeqAIJ(A)); 683 PetscFunctionReturn(PETSC_SUCCESS); 684 } 685 686 /*MC 687 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 688 689 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 690 691 Options Database Key: 692 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 693 694 Level: beginner 695 696 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 697 M*/ 698 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 699 { 700 PetscFunctionBegin; 701 PetscCall(PetscKokkosInitializeCheck()); 702 PetscCall(MatCreate_SeqAIJ(A)); 703 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 704 PetscFunctionReturn(PETSC_SUCCESS); 705 } 706 707 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 708 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 709 { 710 Mat_SeqAIJ *a, *b; 711 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 712 MatScalarKokkosView aa, ba, ca; 713 MatRowMapKokkosView ai, bi, ci; 714 MatColIdxKokkosView aj, bj, cj; 715 PetscInt m, n, nnz, aN; 716 717 PetscFunctionBegin; 718 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 719 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 720 PetscAssertPointer(C, 4); 721 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 722 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 723 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 724 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 725 726 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 727 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 728 a = static_cast<Mat_SeqAIJ *>(A->data); 729 b = static_cast<Mat_SeqAIJ *>(B->data); 730 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 731 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 732 aa = akok->a_dual.view_device(); 733 ai = akok->i_dual.view_device(); 734 ba = bkok->a_dual.view_device(); 735 bi = bkok->i_dual.view_device(); 736 m = A->rmap->n; /* M, N and nnz of C */ 737 n = A->cmap->n + B->cmap->n; 738 nnz = a->nz + b->nz; 739 aN = A->cmap->n; /* N of A */ 740 if (reuse == MAT_INITIAL_MATRIX) { 741 aj = akok->j_dual.view_device(); 742 bj = bkok->j_dual.view_device(); 743 auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 744 auto ci = MatRowMapKokkosView("i", ai.extent(0)); 745 auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 746 747 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 748 Kokkos::parallel_for( 749 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 750 PetscInt i = t.league_rank(); /* row i */ 751 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 752 753 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 754 ci(i) = coffset; 755 if (i == m - 1) ci(m) = ai(m) + bi(m); 756 }); 757 758 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 759 if (k < alen) { 760 ca(coffset + k) = aa(ai(i) + k); 761 cj(coffset + k) = aj(ai(i) + k); 762 } else { 763 ca(coffset + k) = ba(bi(i) + k - alen); 764 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 765 } 766 }); 767 }); 768 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 769 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 770 } else if (reuse == MAT_REUSE_MATRIX) { 771 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 772 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 773 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 774 ca = ckok->a_dual.view_device(); 775 ci = ckok->i_dual.view_device(); 776 777 Kokkos::parallel_for( 778 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 779 PetscInt i = t.league_rank(); /* row i */ 780 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 781 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 782 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 783 else ca(ci(i) + k) = ba(bi(i) + k - alen); 784 }); 785 }); 786 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 787 } 788 PetscFunctionReturn(PETSC_SUCCESS); 789 } 790 791 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata) 792 { 793 PetscFunctionBegin; 794 delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata); 795 PetscFunctionReturn(PETSC_SUCCESS); 796 } 797 798 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 799 { 800 Mat_Product *product = C->product; 801 Mat A, B; 802 bool transA, transB; /* use bool, since KK needs this type */ 803 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 804 Mat_SeqAIJ *c; 805 MatProductCtx_SeqAIJKokkos *pdata; 806 KokkosCsrMatrix csrmatA, csrmatB; 807 808 PetscFunctionBegin; 809 MatCheckProduct(C, 1); 810 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 811 pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data); 812 813 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 814 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 815 // we still do numeric. 816 if (pdata->reusesym) { // numeric reuses results from symbolic 817 pdata->reusesym = PETSC_FALSE; 818 PetscFunctionReturn(PETSC_SUCCESS); 819 } 820 821 switch (product->type) { 822 case MATPRODUCT_AB: 823 transA = false; 824 transB = false; 825 break; 826 case MATPRODUCT_AtB: 827 transA = true; 828 transB = false; 829 break; 830 case MATPRODUCT_ABt: 831 transA = false; 832 transB = true; 833 break; 834 default: 835 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 836 } 837 838 A = product->A; 839 B = product->B; 840 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 841 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 842 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 843 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 844 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 845 846 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 847 848 csrmatA = akok->csrmat; 849 csrmatB = bkok->csrmat; 850 851 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 852 if (transA) { 853 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 854 transA = false; 855 } 856 857 if (transB) { 858 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 859 transB = false; 860 } 861 PetscCall(PetscLogGpuTimeBegin()); 862 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 863 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 864 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 865 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 866 #endif 867 868 PetscCall(PetscLogGpuTimeEnd()); 869 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 870 /* shorter version of MatAssemblyEnd_SeqAIJ */ 871 c = (Mat_SeqAIJ *)C->data; 872 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 873 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 874 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 875 c->reallocs = 0; 876 C->info.mallocs = 0; 877 C->info.nz_unneeded = 0; 878 C->assembled = C->was_assembled = PETSC_TRUE; 879 C->num_ass++; 880 PetscFunctionReturn(PETSC_SUCCESS); 881 } 882 883 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 884 { 885 Mat_Product *product = C->product; 886 MatProductType ptype; 887 Mat A, B; 888 bool transA, transB; 889 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 890 MatProductCtx_SeqAIJKokkos *pdata; 891 MPI_Comm comm; 892 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 893 894 PetscFunctionBegin; 895 MatCheckProduct(C, 1); 896 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 897 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 898 A = product->A; 899 B = product->B; 900 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 901 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 902 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 903 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 904 csrmatA = akok->csrmat; 905 csrmatB = bkok->csrmat; 906 907 ptype = product->type; 908 // Take advantage of the symmetry if true 909 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 910 ptype = MATPRODUCT_AB; 911 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 912 } 913 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 914 ptype = MATPRODUCT_AB; 915 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 916 } 917 918 switch (ptype) { 919 case MATPRODUCT_AB: 920 transA = false; 921 transB = false; 922 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 923 break; 924 case MATPRODUCT_AtB: 925 transA = true; 926 transB = false; 927 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 928 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 929 break; 930 case MATPRODUCT_ABt: 931 transA = false; 932 transB = true; 933 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 934 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 935 break; 936 default: 937 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 938 } 939 PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos()); 940 pdata->reusesym = product->api_user; 941 942 /* TODO: add command line options to select spgemm algorithms */ 943 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 944 945 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 946 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 947 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 948 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 949 #endif 950 #endif 951 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 952 953 PetscCall(PetscLogGpuTimeBegin()); 954 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 955 if (transA) { 956 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 957 transA = false; 958 } 959 960 if (transB) { 961 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 962 transB = false; 963 } 964 965 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 966 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 967 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 968 calling new Mat_SeqAIJKokkos(). 969 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 970 */ 971 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 972 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 973 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 974 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 975 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 976 #endif 977 PetscCall(PetscLogGpuTimeEnd()); 978 979 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 980 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 981 C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos; 982 PetscFunctionReturn(PETSC_SUCCESS); 983 } 984 985 /* handles sparse matrix matrix ops */ 986 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 987 { 988 Mat_Product *product = mat->product; 989 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 990 991 PetscFunctionBegin; 992 MatCheckProduct(mat, 1); 993 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 994 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 995 if (Biskok && Ciskok) { 996 switch (product->type) { 997 case MATPRODUCT_AB: 998 case MATPRODUCT_AtB: 999 case MATPRODUCT_ABt: 1000 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1001 break; 1002 case MATPRODUCT_PtAP: 1003 case MATPRODUCT_RARt: 1004 case MATPRODUCT_ABC: 1005 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1006 break; 1007 default: 1008 break; 1009 } 1010 } else { /* fallback for AIJ */ 1011 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1012 } 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1017 { 1018 Mat_SeqAIJKokkos *aijkok; 1019 1020 PetscFunctionBegin; 1021 PetscCall(PetscLogGpuTimeBegin()); 1022 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1023 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1024 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1025 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1026 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1027 PetscCall(PetscLogGpuTimeEnd()); 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 1031 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1032 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1033 { 1034 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1035 1036 PetscFunctionBegin; 1037 if (A->assembled && aijseq->diagDense) { // no missing diagonals 1038 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1039 1040 PetscCall(PetscLogGpuTimeBegin()); 1041 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1042 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1043 const auto &Aa = aijkok->a_dual.view_device(); 1044 const auto &Adiag = aijkok->diag_dual.view_device(); 1045 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1046 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1047 PetscCall(PetscLogGpuFlops(n)); 1048 PetscCall(PetscLogGpuTimeEnd()); 1049 } else { // need reassembly, very slow! 1050 PetscCall(MatShift_Basic(A, a)); 1051 } 1052 PetscFunctionReturn(PETSC_SUCCESS); 1053 } 1054 1055 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1056 { 1057 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1058 1059 PetscFunctionBegin; 1060 if (Y->assembled && aijseq->diagDense) { // no missing diagonals 1061 ConstPetscScalarKokkosView dv; 1062 PetscInt n, nv; 1063 1064 PetscCall(PetscLogGpuTimeBegin()); 1065 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1066 PetscCall(VecGetKokkosView(D, &dv)); 1067 PetscCall(VecGetLocalSize(D, &nv)); 1068 n = PetscMin(Y->rmap->n, Y->cmap->n); 1069 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1070 1071 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1072 const auto &Aa = aijkok->a_dual.view_device(); 1073 const auto &Adiag = aijkok->diag_dual.view_device(); 1074 PetscCallCXX(Kokkos::parallel_for( 1075 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1076 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1077 else Aa(Adiag(i)) += dv(i); 1078 })); 1079 PetscCall(VecRestoreKokkosView(D, &dv)); 1080 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1081 PetscCall(PetscLogGpuFlops(n)); 1082 PetscCall(PetscLogGpuTimeEnd()); 1083 } else { // need reassembly, very slow! 1084 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1085 } 1086 PetscFunctionReturn(PETSC_SUCCESS); 1087 } 1088 1089 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1090 { 1091 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1092 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1093 ConstPetscScalarKokkosView lv, rv; 1094 1095 PetscFunctionBegin; 1096 PetscCall(PetscLogGpuTimeBegin()); 1097 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1098 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1099 const auto &Aa = aijkok->a_dual.view_device(); 1100 const auto &Ai = aijkok->i_dual.view_device(); 1101 const auto &Aj = aijkok->j_dual.view_device(); 1102 if (ll) { 1103 PetscCall(VecGetLocalSize(ll, &m)); 1104 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1105 PetscCall(VecGetKokkosView(ll, &lv)); 1106 PetscCallCXX(Kokkos::parallel_for( // for each row 1107 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1108 PetscInt i = t.league_rank(); // row i 1109 PetscInt len = Ai(i + 1) - Ai(i); 1110 // scale entries on the row 1111 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1112 })); 1113 PetscCall(VecRestoreKokkosView(ll, &lv)); 1114 PetscCall(PetscLogGpuFlops(nz)); 1115 } 1116 if (rr) { 1117 PetscCall(VecGetLocalSize(rr, &n)); 1118 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1119 PetscCall(VecGetKokkosView(rr, &rv)); 1120 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1121 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1122 PetscCall(VecRestoreKokkosView(rr, &lv)); 1123 PetscCall(PetscLogGpuFlops(nz)); 1124 } 1125 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1126 PetscCall(PetscLogGpuTimeEnd()); 1127 PetscFunctionReturn(PETSC_SUCCESS); 1128 } 1129 1130 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1131 { 1132 Mat_SeqAIJKokkos *aijkok; 1133 1134 PetscFunctionBegin; 1135 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1136 if (aijkok) { /* Only zero the device if data is already there */ 1137 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1138 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1139 } else { /* Might be preallocated but not assembled */ 1140 PetscCall(MatZeroEntries_SeqAIJ(A)); 1141 } 1142 PetscFunctionReturn(PETSC_SUCCESS); 1143 } 1144 1145 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1146 { 1147 Mat_SeqAIJKokkos *aijkok; 1148 PetscInt n; 1149 PetscScalarKokkosView xv; 1150 1151 PetscFunctionBegin; 1152 PetscCall(VecGetLocalSize(x, &n)); 1153 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1154 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1155 1156 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1157 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1158 1159 const auto &Aa = aijkok->a_dual.view_device(); 1160 const auto &Ai = aijkok->i_dual.view_device(); 1161 const auto &Adiag = aijkok->diag_dual.view_device(); 1162 1163 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1164 Kokkos::parallel_for( 1165 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1166 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1167 else xv(i) = 0; 1168 }); 1169 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1170 PetscFunctionReturn(PETSC_SUCCESS); 1171 } 1172 1173 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1174 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1175 { 1176 Mat_SeqAIJKokkos *aijkok; 1177 1178 PetscFunctionBegin; 1179 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1180 PetscAssertPointer(kv, 2); 1181 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1182 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1183 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1184 *kv = aijkok->a_dual.view_device(); 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1189 { 1190 PetscFunctionBegin; 1191 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1192 PetscAssertPointer(kv, 2); 1193 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1194 PetscFunctionReturn(PETSC_SUCCESS); 1195 } 1196 1197 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1198 { 1199 Mat_SeqAIJKokkos *aijkok; 1200 1201 PetscFunctionBegin; 1202 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1203 PetscAssertPointer(kv, 2); 1204 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1205 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1206 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1207 *kv = aijkok->a_dual.view_device(); 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 1211 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1212 { 1213 PetscFunctionBegin; 1214 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1215 PetscAssertPointer(kv, 2); 1216 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1217 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1218 PetscFunctionReturn(PETSC_SUCCESS); 1219 } 1220 1221 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1222 { 1223 Mat_SeqAIJKokkos *aijkok; 1224 1225 PetscFunctionBegin; 1226 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1227 PetscAssertPointer(kv, 2); 1228 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1229 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1230 *kv = aijkok->a_dual.view_device(); 1231 PetscFunctionReturn(PETSC_SUCCESS); 1232 } 1233 1234 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1235 { 1236 PetscFunctionBegin; 1237 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1238 PetscAssertPointer(kv, 2); 1239 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1240 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1241 PetscFunctionReturn(PETSC_SUCCESS); 1242 } 1243 1244 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1245 { 1246 Mat_SeqAIJKokkos *akok; 1247 1248 PetscFunctionBegin; 1249 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1250 PetscCall(MatCreate(comm, A)); 1251 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1252 PetscFunctionReturn(PETSC_SUCCESS); 1253 } 1254 1255 /* Computes Y += alpha X */ 1256 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1257 { 1258 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1259 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1260 ConstMatScalarKokkosView Xa; 1261 MatScalarKokkosView Ya; 1262 auto exec = PetscGetKokkosExecutionSpace(); 1263 1264 PetscFunctionBegin; 1265 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1266 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1267 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1268 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1269 PetscCall(PetscLogGpuTimeBegin()); 1270 1271 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1272 PetscBool e; 1273 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1274 if (e) { 1275 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1276 if (e) pattern = SAME_NONZERO_PATTERN; 1277 } 1278 } 1279 1280 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1281 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1282 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1283 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1284 */ 1285 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1286 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1287 Xa = xkok->a_dual.view_device(); 1288 Ya = ykok->a_dual.view_device(); 1289 1290 if (pattern == SAME_NONZERO_PATTERN) { 1291 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1292 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1293 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1294 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1295 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1296 1297 Kokkos::parallel_for( 1298 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1299 PetscInt i = t.league_rank(); // row i 1300 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1301 // Only one thread works in a team 1302 PetscInt p, q = Yi(i); 1303 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1304 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1305 if (Xj(p) == Yj(q)) { // Found it 1306 Ya(q) += alpha * Xa(p); 1307 q++; 1308 } else { 1309 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1310 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1311 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1312 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1313 #else 1314 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1315 #endif 1316 } 1317 } 1318 }); 1319 }); 1320 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1321 } else { // different nonzero patterns 1322 Mat Z; 1323 KokkosCsrMatrix zcsr; 1324 KernelHandle kh; 1325 kh.create_spadd_handle(true); // X, Y are sorted 1326 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1327 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1328 zkok = new Mat_SeqAIJKokkos(zcsr); 1329 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1330 PetscCall(MatHeaderReplace(Y, &Z)); 1331 kh.destroy_spadd_handle(); 1332 } 1333 PetscCall(PetscLogGpuTimeEnd()); 1334 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1335 PetscFunctionReturn(PETSC_SUCCESS); 1336 } 1337 1338 struct MatCOOStruct_SeqAIJKokkos { 1339 PetscCount n; 1340 PetscCount Atot; 1341 PetscInt nz; 1342 PetscCountKokkosView jmap; 1343 PetscCountKokkosView perm; 1344 1345 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1346 { 1347 nz = coo_h->nz; 1348 n = coo_h->n; 1349 Atot = coo_h->Atot; 1350 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1351 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1352 } 1353 }; 1354 1355 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1356 { 1357 PetscFunctionBegin; 1358 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1359 PetscFunctionReturn(PETSC_SUCCESS); 1360 } 1361 1362 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1363 { 1364 Mat_SeqAIJKokkos *akok; 1365 Mat_SeqAIJ *aseq; 1366 PetscContainer container_h; 1367 MatCOOStruct_SeqAIJ *coo_h; 1368 MatCOOStruct_SeqAIJKokkos *coo_d; 1369 1370 PetscFunctionBegin; 1371 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1372 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1373 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1374 delete akok; 1375 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1376 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1377 1378 // Copy the COO struct to device 1379 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1380 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1381 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1382 1383 // Put the COO struct in a container and then attach that to the matrix 1384 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1389 { 1390 MatScalarKokkosView Aa; 1391 ConstMatScalarKokkosView kv; 1392 PetscMemType memtype; 1393 PetscContainer container; 1394 MatCOOStruct_SeqAIJKokkos *coo; 1395 1396 PetscFunctionBegin; 1397 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1398 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1399 1400 const auto &n = coo->n; 1401 const auto &Annz = coo->nz; 1402 const auto &jmap = coo->jmap; 1403 const auto &perm = coo->perm; 1404 1405 PetscCall(PetscGetMemType(v, &memtype)); 1406 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1407 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1408 } else { 1409 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1410 } 1411 1412 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1413 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1414 1415 PetscCall(PetscLogGpuTimeBegin()); 1416 Kokkos::parallel_for( 1417 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1418 PetscScalar sum = 0.0; 1419 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1420 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1421 }); 1422 PetscCall(PetscLogGpuTimeEnd()); 1423 1424 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1425 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1426 PetscFunctionReturn(PETSC_SUCCESS); 1427 } 1428 1429 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1430 { 1431 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1432 1433 PetscFunctionBegin; 1434 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1435 A->boundtocpu = PETSC_FALSE; 1436 1437 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1438 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1439 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1440 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1441 A->ops->scale = MatScale_SeqAIJKokkos; 1442 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1443 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1444 A->ops->mult = MatMult_SeqAIJKokkos; 1445 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1446 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1447 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1448 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1449 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1450 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1451 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1452 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1453 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1454 A->ops->shift = MatShift_SeqAIJKokkos; 1455 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1456 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1457 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1458 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1459 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1460 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1461 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1462 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1463 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1464 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1465 1466 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1467 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1468 #if defined(PETSC_HAVE_HYPRE) 1469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1470 #endif 1471 PetscFunctionReturn(PETSC_SUCCESS); 1472 } 1473 1474 /* 1475 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1476 1477 Input Parameters: 1478 + A - the MATSEQAIJKOKKOS matrix 1479 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1480 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1481 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1482 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1483 1484 Output Parameter: 1485 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1486 */ 1487 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1488 { 1489 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1490 PetscInt nblocks = bs.extent(0) - 1; 1491 1492 PetscFunctionBegin; 1493 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1494 1495 // Pull out the diagonal blocks of the matrix and then invert the blocks 1496 auto Aa = akok->a_dual.view_device(); 1497 auto Ai = akok->i_dual.view_device(); 1498 auto Aj = akok->j_dual.view_device(); 1499 auto Adiag = akok->diag_dual.view_device(); 1500 // TODO: how to tune the team size? 1501 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1502 auto ts = Kokkos::AUTO(); 1503 #else 1504 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1505 #endif 1506 PetscCallCXX(Kokkos::parallel_for( 1507 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1508 const PetscInt bid = teamMember.league_rank(); // block id 1509 const PetscInt rstart = bs(bid); // this block starts from this row 1510 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1511 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1512 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1513 1514 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1515 PetscInt i = rstart + r; // i-th row in A 1516 1517 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1518 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1519 1520 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1521 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1522 B(r, c) = 0.0; 1523 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1524 B(r, c) = Aa(first + c); 1525 } else { // this entry does not show up in the CSR 1526 B(r, c) = 0.0; 1527 } 1528 } 1529 } else { // rare case that the diagonal does not exist 1530 const PetscInt begin = Ai(i); 1531 const PetscInt end = Ai(i + 1); 1532 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1533 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1534 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1535 else if (Aj(j) >= rstart + m) break; 1536 } 1537 } 1538 }); 1539 1540 // LU-decompose B (w/o pivoting) and then invert B 1541 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1542 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1543 })); 1544 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1545 PetscFunctionReturn(PETSC_SUCCESS); 1546 } 1547 1548 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1549 { 1550 Mat_SeqAIJ *aseq; 1551 PetscInt i, m, n; 1552 auto exec = PetscGetKokkosExecutionSpace(); 1553 1554 PetscFunctionBegin; 1555 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1556 1557 m = akok->nrows(); 1558 n = akok->ncols(); 1559 PetscCall(MatSetSizes(A, m, n, m, n)); 1560 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1561 1562 /* Set up data structures of A as a MATSEQAIJ */ 1563 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1564 aseq = (Mat_SeqAIJ *)A->data; 1565 1566 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1567 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1568 1569 aseq->i = akok->i_host_data(); 1570 aseq->j = akok->j_host_data(); 1571 aseq->a = akok->a_host_data(); 1572 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1573 aseq->free_a = PETSC_FALSE; 1574 aseq->free_ij = PETSC_FALSE; 1575 aseq->nz = akok->nnz(); 1576 aseq->maxnz = aseq->nz; 1577 1578 PetscCall(PetscMalloc1(m, &aseq->imax)); 1579 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1580 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1581 1582 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1583 akok->nonzerostate = A->nonzerostate; 1584 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1585 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1586 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1587 PetscFunctionReturn(PETSC_SUCCESS); 1588 } 1589 1590 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1591 { 1592 PetscFunctionBegin; 1593 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1594 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1595 PetscFunctionReturn(PETSC_SUCCESS); 1596 } 1597 1598 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1599 { 1600 Mat_SeqAIJKokkos *akok; 1601 1602 PetscFunctionBegin; 1603 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1604 PetscCall(MatCreate(comm, A)); 1605 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1606 PetscFunctionReturn(PETSC_SUCCESS); 1607 } 1608 1609 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1610 1611 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1612 */ 1613 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1614 { 1615 PetscFunctionBegin; 1616 PetscCall(MatCreate(comm, A)); 1617 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1618 PetscFunctionReturn(PETSC_SUCCESS); 1619 } 1620 1621 /*@C 1622 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1623 (the default parallel PETSc format). This matrix will ultimately be handled by 1624 Kokkos for calculations. 1625 1626 Collective 1627 1628 Input Parameters: 1629 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1630 . m - number of rows 1631 . n - number of columns 1632 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1633 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1634 1635 Output Parameter: 1636 . A - the matrix 1637 1638 Level: intermediate 1639 1640 Notes: 1641 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1642 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1643 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1644 1645 The AIJ format, also called 1646 compressed row storage, is fully compatible with standard Fortran 1647 storage. That is, the stored row and column indices can begin at 1648 either one (as in Fortran) or zero. 1649 1650 Specify the preallocated storage with either `nz` or `nnz` (not both). 1651 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1652 allocation. 1653 1654 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1655 @*/ 1656 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1657 { 1658 PetscFunctionBegin; 1659 PetscCall(PetscKokkosInitializeCheck()); 1660 PetscCall(MatCreate(comm, A)); 1661 PetscCall(MatSetSizes(*A, m, n, m, n)); 1662 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1663 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1664 PetscFunctionReturn(PETSC_SUCCESS); 1665 } 1666 1667 // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1668 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1669 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1670 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1671 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1672 { 1673 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1674 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1675 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 1676 1677 PetscFunctionBegin; 1678 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1679 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1680 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 1681 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1682 } 1683 PetscFunctionReturn(PETSC_SUCCESS); 1684 } 1685 1686 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1687 { 1688 const PetscInt n = A->rmap->n; 1689 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1690 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1691 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 1692 1693 PetscFunctionBegin; 1694 if (!factors->transpose_updated) { 1695 if (has_upper) { 1696 if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1697 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1698 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1699 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1700 } 1701 1702 if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1703 if (!factors->U) { 1704 Mat_SeqAIJ *seq; 1705 1706 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1707 PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 1708 1709 seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1710 factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1711 factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1712 factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1713 } else { 1714 PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1715 } 1716 // Copy Ut from host to device 1717 PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1718 PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1719 PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1720 } else { // If U was computed on device, we also compute the transpose there 1721 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1722 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1723 factors->jU_d, factors->aU_d, 1724 factors->iUt_d, factors->jUt_d, 1725 factors->aUt_d)); 1726 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1727 } 1728 PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1729 } 1730 1731 // do the same for L with LU 1732 if (has_lower) { 1733 if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1734 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1735 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1736 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1737 } 1738 1739 if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1740 if (!factors->L) { 1741 Mat_SeqAIJ *seq; 1742 1743 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1744 PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1745 1746 seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1747 factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1748 factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1749 factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1750 } else { 1751 PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1752 } 1753 // Copy Lt from host to device 1754 PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1755 PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1756 PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1757 } else { // If L was computed on device, we also compute the transpose there 1758 // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1759 PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1760 factors->jL_d, factors->aL_d, 1761 factors->iLt_d, factors->jLt_d, 1762 factors->aLt_d)); 1763 PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1764 } 1765 PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1766 } 1767 1768 factors->transpose_updated = PETSC_TRUE; 1769 } 1770 PetscFunctionReturn(PETSC_SUCCESS); 1771 } 1772 1773 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1774 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1775 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1776 { 1777 auto exec = PetscGetKokkosExecutionSpace(); 1778 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1779 PetscInt m = A->rmap->n; 1780 PetscScalarKokkosView D = factors->D_d; 1781 PetscScalarKokkosView X, Y, B; // alias 1782 ConstPetscScalarKokkosView b; 1783 PetscScalarKokkosView x; 1784 PetscIntKokkosView &rowperm = factors->rowperm; 1785 PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1786 1787 PetscFunctionBegin; 1788 PetscCall(PetscLogGpuTimeBegin()); 1789 PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1790 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1791 PetscCall(VecGetKokkosView(bb, &b)); 1792 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1793 1794 // Solve U^T Y = B 1795 if (identity) { // Reorder b with the row permutation 1796 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1797 Y = factors->workVector; 1798 } else { 1799 B = factors->workVector; 1800 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1801 Y = x; 1802 } 1803 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1804 1805 // Solve diag(D) Y' = Y. 1806 // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1807 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1808 1809 // Solve UX = Y 1810 if (identity) { 1811 X = x; 1812 } else { 1813 X = factors->workVector; // B is not needed anymore 1814 } 1815 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1816 1817 // Reorder X with the inverse column (row) permutation 1818 if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1819 1820 PetscCall(VecRestoreKokkosView(bb, &b)); 1821 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1822 PetscCall(PetscLogGpuTimeEnd()); 1823 PetscFunctionReturn(PETSC_SUCCESS); 1824 } 1825 1826 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1827 // R and C are represented by rowperm and colperm in factors. 1828 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1829 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1830 { 1831 auto exec = PetscGetKokkosExecutionSpace(); 1832 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1833 PetscInt m = A->rmap->n; 1834 PetscScalarKokkosView X, Y, B; // alias 1835 ConstPetscScalarKokkosView b; 1836 PetscScalarKokkosView x; 1837 PetscIntKokkosView &rowperm = factors->rowperm; 1838 PetscIntKokkosView &colperm = factors->colperm; 1839 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1840 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1841 1842 PetscFunctionBegin; 1843 PetscCall(PetscLogGpuTimeBegin()); 1844 PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1845 PetscCall(VecGetKokkosView(bb, &b)); 1846 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1847 1848 // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1849 if (row_identity) { 1850 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1851 Y = factors->workVector; 1852 } else { 1853 B = factors->workVector; 1854 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1855 Y = x; 1856 } 1857 PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1858 1859 // Solve U C^- x = Y 1860 if (col_identity) { 1861 X = x; 1862 } else { 1863 X = factors->workVector; 1864 } 1865 PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1866 1867 // x = C X; Reorder X with the inverse col permutation 1868 if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1869 1870 PetscCall(VecRestoreKokkosView(bb, &b)); 1871 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1872 PetscCall(PetscLogGpuTimeEnd()); 1873 PetscFunctionReturn(PETSC_SUCCESS); 1874 } 1875 1876 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1877 // R and C are represented by rowperm and colperm in factors. 1878 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1879 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1880 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1881 { 1882 auto exec = PetscGetKokkosExecutionSpace(); 1883 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1884 PetscInt m = A->rmap->n; 1885 PetscScalarKokkosView X, Y, B; // alias 1886 ConstPetscScalarKokkosView b; 1887 PetscScalarKokkosView x; 1888 PetscIntKokkosView &rowperm = factors->rowperm; 1889 PetscIntKokkosView &colperm = factors->colperm; 1890 PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1891 PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1892 1893 PetscFunctionBegin; 1894 PetscCall(PetscLogGpuTimeBegin()); 1895 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1896 PetscCall(VecGetKokkosView(bb, &b)); 1897 PetscCall(VecGetKokkosViewWrite(xx, &x)); 1898 1899 // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1900 if (col_identity) { // Reorder b with the col permutation 1901 B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1902 Y = factors->workVector; 1903 } else { 1904 B = factors->workVector; 1905 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1906 Y = x; 1907 } 1908 PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1909 1910 // Solve L^T X = Y 1911 if (row_identity) { 1912 X = x; 1913 } else { 1914 X = factors->workVector; 1915 } 1916 PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1917 1918 // x = R^- X = R^T X; Reorder X with the inverse row permutation 1919 if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1920 1921 PetscCall(VecRestoreKokkosView(bb, &b)); 1922 PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1923 PetscCall(PetscLogGpuTimeEnd()); 1924 PetscFunctionReturn(PETSC_SUCCESS); 1925 } 1926 1927 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1928 { 1929 PetscFunctionBegin; 1930 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1931 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1932 1933 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1934 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1935 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1936 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1937 const MatScalar *Ba = b->a; 1938 PetscInt m = B->rmap->n, n = B->cmap->n; 1939 1940 if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1941 // Allocate memory and copy the structure 1942 factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1943 factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1944 factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1945 factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1946 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1947 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1948 1949 PetscInt *Li = factors->iL_h.data(); 1950 PetscInt *Lj = factors->jL_h.data(); 1951 PetscInt *Ui = factors->iU_h.data(); 1952 PetscInt *Uj = factors->jU_h.data(); 1953 1954 Li[0] = Ui[0] = 0; 1955 for (PetscInt i = 0; i < m; i++) { 1956 PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1957 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1958 1959 PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1960 Lj[Li[i] + llen] = i; // diagonal entry of L 1961 1962 Uj[Ui[i]] = i; // diagonal entry of U 1963 PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1964 1965 Li[i + 1] = Li[i] + llen + 1; 1966 Ui[i + 1] = Ui[i] + ulen; 1967 } 1968 1969 factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1970 factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1971 factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1972 factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1973 factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1974 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1975 1976 // Copy row/col permutation to device 1977 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1978 PetscBool row_identity; 1979 PetscCall(ISIdentity(rowperm, &row_identity)); 1980 if (!row_identity) { 1981 const PetscInt *ip; 1982 1983 PetscCall(ISGetIndices(rowperm, &ip)); 1984 factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1985 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1986 PetscCall(ISRestoreIndices(rowperm, &ip)); 1987 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1988 } 1989 1990 IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1991 PetscBool col_identity; 1992 PetscCall(ISIdentity(colperm, &col_identity)); 1993 if (!col_identity) { 1994 const PetscInt *ip; 1995 1996 PetscCall(ISGetIndices(colperm, &ip)); 1997 factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 1998 PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 1999 PetscCall(ISRestoreIndices(colperm, &ip)); 2000 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2001 } 2002 2003 /* Create sptrsv handles for L, U and their transpose */ 2004 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2005 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2006 #else 2007 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2008 #endif 2009 factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2010 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2011 factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2012 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2013 } 2014 2015 // Copy the value 2016 for (PetscInt i = 0; i < m; i++) { 2017 PetscInt llen = Bi[i + 1] - Bi[i]; 2018 PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2019 const PetscInt *Li = factors->iL_h.data(); 2020 const PetscInt *Ui = factors->iU_h.data(); 2021 2022 PetscScalar *La = factors->aL_h.data(); 2023 PetscScalar *Ua = factors->aU_h.data(); 2024 2025 PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2026 La[Li[i] + llen] = 1.0; // diagonal entry 2027 2028 Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2029 PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2030 } 2031 2032 PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2033 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2034 // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2035 factors->transpose_updated = PETSC_FALSE; 2036 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2037 2038 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2039 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2040 } 2041 2042 B->ops->matsolve = NULL; 2043 B->ops->matsolvetranspose = NULL; 2044 PetscFunctionReturn(PETSC_SUCCESS); 2045 } 2046 2047 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2048 { 2049 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2050 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2051 PetscInt fill_lev = info->levels; 2052 2053 PetscFunctionBegin; 2054 PetscCall(PetscLogGpuTimeBegin()); 2055 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 2056 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2057 2058 auto a_d = aijkok->a_dual.view_device(); 2059 auto i_d = aijkok->i_dual.view_device(); 2060 auto j_d = aijkok->j_dual.view_device(); 2061 2062 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 2063 2064 B->assembled = PETSC_TRUE; 2065 B->preallocated = PETSC_TRUE; 2066 B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2067 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2068 B->ops->matsolve = NULL; 2069 B->ops->matsolvetranspose = NULL; 2070 2071 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 2072 factors->transpose_updated = PETSC_FALSE; 2073 factors->sptrsv_symbolic_completed = PETSC_FALSE; 2074 /* TODO: log flops, but how to know that? */ 2075 PetscCall(PetscLogGpuTimeEnd()); 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2080 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2081 { 2082 Mat_SeqAIJKokkos *aijkok; 2083 Mat_SeqAIJ *b; 2084 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2085 PetscInt fill_lev = info->levels; 2086 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 2087 PetscInt n = A->rmap->n; 2088 2089 PetscFunctionBegin; 2090 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 2091 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2092 2093 /* Create a spiluk handle and then do symbolic factorization */ 2094 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2095 factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 2096 2097 auto spiluk_handle = factors->kh.get_spiluk_handle(); 2098 2099 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 2100 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 2101 Kokkos::realloc(factors->iU_d, n + 1); 2102 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 2103 2104 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2105 auto i_d = aijkok->i_dual.view_device(); 2106 auto j_d = aijkok->j_dual.view_device(); 2107 PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 2108 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 2109 2110 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 2111 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 2112 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 2113 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 2114 2115 /* TODO: add options to select sptrsv algorithms */ 2116 /* Create sptrsv handles for L, U and their transpose */ 2117 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2118 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2119 #else 2120 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2121 #endif 2122 2123 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 2124 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 2125 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 2126 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 2127 2128 /* Fill fields of the factor matrix B */ 2129 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 2130 b = (Mat_SeqAIJ *)B->data; 2131 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 2132 B->info.fill_ratio_given = info->fill; 2133 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 2134 2135 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 2136 PetscFunctionReturn(PETSC_SUCCESS); 2137 } 2138 2139 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2140 { 2141 PetscFunctionBegin; 2142 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2143 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2144 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2145 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2146 PetscFunctionReturn(PETSC_SUCCESS); 2147 } 2148 2149 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2150 { 2151 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2152 2153 PetscFunctionBegin; 2154 if (!info->factoronhost) { 2155 PetscCall(ISIdentity(isrow, &row_identity)); 2156 PetscCall(ISIdentity(iscol, &col_identity)); 2157 } 2158 2159 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2160 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2161 2162 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2163 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2164 } else { 2165 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2166 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2167 } 2168 PetscFunctionReturn(PETSC_SUCCESS); 2169 } 2170 2171 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2172 { 2173 PetscFunctionBegin; 2174 PetscCall(MatSeqAIJKokkosSyncHost(A)); 2175 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2176 2177 if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2178 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2179 Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2180 const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2181 const MatScalar *Ba = b->a; 2182 PetscInt m = B->rmap->n; 2183 2184 if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2185 // Allocate memory and copy the structure 2186 factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2187 factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2188 factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2189 factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2190 factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2191 factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2192 2193 // Build jU_h from the skewed Aj 2194 PetscInt *Uj = factors->jU_h.data(); 2195 for (PetscInt i = 0; i < m; i++) { 2196 PetscInt ulen = Bi[i + 1] - Bi[i]; 2197 Uj[Bi[i]] = i; // diagonal entry 2198 PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2199 } 2200 2201 // Copy iU, jU to device 2202 PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2203 PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2204 2205 // Copy row/col permutation to device 2206 IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2207 PetscBool row_identity; 2208 PetscCall(ISIdentity(rowperm, &row_identity)); 2209 if (!row_identity) { 2210 const PetscInt *ip; 2211 2212 PetscCall(ISGetIndices(rowperm, &ip)); 2213 PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2214 PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2215 PetscCall(ISRestoreIndices(rowperm, &ip)); 2216 PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2217 } 2218 2219 // Create sptrsv handles for U and U^T 2220 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2221 auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2222 #else 2223 auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2224 #endif 2225 factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2226 factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2227 } 2228 // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2229 B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2230 B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2231 2232 // Copy the value 2233 PetscScalar *Ua = factors->aU_h.data(); 2234 PetscScalar *D = factors->D_h.data(); 2235 for (PetscInt i = 0; i < m; i++) { 2236 D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2237 Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2238 for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2239 } 2240 PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2241 PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2242 2243 factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2244 factors->transpose_updated = PETSC_FALSE; 2245 } 2246 2247 B->ops->matsolve = NULL; 2248 B->ops->matsolvetranspose = NULL; 2249 PetscFunctionReturn(PETSC_SUCCESS); 2250 } 2251 2252 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2253 { 2254 PetscFunctionBegin; 2255 if (info->solveonhost) { 2256 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2257 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2258 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2259 } 2260 2261 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2262 2263 if (!info->solveonhost) { 2264 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2265 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2266 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2267 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2268 } 2269 PetscFunctionReturn(PETSC_SUCCESS); 2270 } 2271 2272 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2273 { 2274 PetscFunctionBegin; 2275 if (info->solveonhost) { 2276 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2277 PetscCall(MatSetType(B, MATSEQSBAIJ)); 2278 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2279 } 2280 2281 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2282 2283 if (!info->solveonhost) { 2284 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2285 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2286 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2287 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2288 } 2289 PetscFunctionReturn(PETSC_SUCCESS); 2290 } 2291 2292 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2293 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2294 { 2295 PetscFunctionBegin; 2296 *type = MATSOLVERKOKKOS; 2297 PetscFunctionReturn(PETSC_SUCCESS); 2298 } 2299 2300 /*MC 2301 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 2302 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2303 2304 Level: beginner 2305 2306 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS` 2307 M*/ 2308 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2309 { 2310 PetscInt n = A->rmap->n; 2311 MPI_Comm comm; 2312 2313 PetscFunctionBegin; 2314 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2315 PetscCall(MatCreate(comm, B)); 2316 PetscCall(MatSetSizes(*B, n, n, n, n)); 2317 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2318 (*B)->factortype = ftype; 2319 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 2320 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2321 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2322 2323 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2324 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2325 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2326 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2327 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2328 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2329 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2330 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2331 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2332 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2333 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2334 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2335 2336 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2337 (*B)->canuseordering = PETSC_TRUE; 2338 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 2339 PetscFunctionReturn(PETSC_SUCCESS); 2340 } 2341 2342 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2343 { 2344 PetscFunctionBegin; 2345 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2346 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 2347 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 2349 PetscFunctionReturn(PETSC_SUCCESS); 2350 } 2351 2352 /* Utility to print out a KokkosCsrMatrix for debugging */ 2353 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2354 { 2355 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 2356 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 2357 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2358 const PetscInt *i = iv.data(); 2359 const PetscInt *j = jv.data(); 2360 const PetscScalar *a = av.data(); 2361 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2362 2363 PetscFunctionBegin; 2364 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2365 for (PetscInt k = 0; k < m; k++) { 2366 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2367 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2368 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2369 } 2370 PetscFunctionReturn(PETSC_SUCCESS); 2371 } 2372