1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 13 // To suppress compiler warnings: 14 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 15 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 16 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 17 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 18 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 19 #include <KokkosSparse_spmv.hpp> 20 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 21 22 #include <KokkosSparse_spiluk.hpp> 23 #include <KokkosSparse_sptrsv.hpp> 24 #include <KokkosSparse_spgemm.hpp> 25 #include <KokkosSparse_spadd.hpp> 26 #include <KokkosBatched_LU_Decl.hpp> 27 #include <KokkosBatched_InverseLU_Decl.hpp> 28 29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 30 31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32 #include <KokkosSparse_Utils.hpp> 33 using KokkosSparse::sort_crs_matrix; 34 using KokkosSparse::Impl::transpose_matrix; 35 #else 36 #include <KokkosKernels_Sorting.hpp> 37 using KokkosKernels::sort_crs_matrix; 38 using KokkosKernels::Impl::transpose_matrix; 39 #endif 40 41 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 42 43 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 44 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 45 In the latter case, it is important to set a_dual's sync state correctly. 46 */ 47 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 48 { 49 Mat_SeqAIJ *aijseq; 50 Mat_SeqAIJKokkos *aijkok; 51 52 PetscFunctionBegin; 53 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 54 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 55 56 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 57 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 58 59 /* If aijkok does not exist, we just copy i, j to device. 60 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 61 In both cases, we build a new aijkok structure. 62 */ 63 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 64 delete aijkok; 65 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 66 A->spptr = aijkok; 67 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 68 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 69 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 70 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 71 } 72 PetscFunctionReturn(PETSC_SUCCESS); 73 } 74 75 /* Sync CSR data to device if not yet */ 76 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 77 { 78 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 79 80 PetscFunctionBegin; 81 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 82 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 83 if (aijkok->a_dual.need_sync_device()) { 84 aijkok->a_dual.sync_device(); 85 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 86 aijkok->hermitian_updated = PETSC_FALSE; 87 } 88 PetscFunctionReturn(PETSC_SUCCESS); 89 } 90 91 /* Mark the CSR data on device as modified */ 92 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 93 { 94 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95 96 PetscFunctionBegin; 97 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 98 aijkok->a_dual.clear_sync_state(); 99 aijkok->a_dual.modify_device(); 100 aijkok->transpose_updated = PETSC_FALSE; 101 aijkok->hermitian_updated = PETSC_FALSE; 102 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 103 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110 auto exec = PetscGetKokkosExecutionSpace(); 111 112 PetscFunctionBegin; 113 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 114 /* We do not expect one needs factors on host */ 115 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 116 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 117 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 118 PetscCallCXX(exec.fence()); 119 PetscFunctionReturn(PETSC_SUCCESS); 120 } 121 122 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 123 { 124 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 125 126 PetscFunctionBegin; 127 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 128 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 129 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 130 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 131 */ 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 auto exec = PetscGetKokkosExecutionSpace(); 134 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 135 PetscCallCXX(exec.fence()); 136 *array = aijkok->a_dual.view_host().data(); 137 } else { /* Happens when calling MatSetValues on a newly created matrix */ 138 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 139 } 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 149 PetscFunctionReturn(PETSC_SUCCESS); 150 } 151 152 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 153 { 154 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 155 156 PetscFunctionBegin; 157 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 158 auto exec = PetscGetKokkosExecutionSpace(); 159 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 160 PetscCallCXX(exec.fence()); 161 *array = aijkok->a_dual.view_host().data(); 162 } else { 163 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 164 } 165 PetscFunctionReturn(PETSC_SUCCESS); 166 } 167 168 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 169 { 170 PetscFunctionBegin; 171 *array = NULL; 172 PetscFunctionReturn(PETSC_SUCCESS); 173 } 174 175 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 176 { 177 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 178 179 PetscFunctionBegin; 180 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 181 *array = aijkok->a_dual.view_host().data(); 182 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 183 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 184 } 185 PetscFunctionReturn(PETSC_SUCCESS); 186 } 187 188 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 189 { 190 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 191 192 PetscFunctionBegin; 193 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 194 aijkok->a_dual.clear_sync_state(); 195 aijkok->a_dual.modify_host(); 196 } 197 PetscFunctionReturn(PETSC_SUCCESS); 198 } 199 200 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 204 PetscFunctionBegin; 205 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 206 207 if (i) *i = aijkok->i_device_data(); 208 if (j) *j = aijkok->j_device_data(); 209 if (a) { 210 aijkok->a_dual.sync_device(); 211 *a = aijkok->a_device_data(); 212 } 213 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 214 PetscFunctionReturn(PETSC_SUCCESS); 215 } 216 217 /* 218 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 219 220 Input Parameter: 221 . A - the MATSEQAIJKOKKOS matrix 222 223 Output Parameters: 224 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 225 - T_d - the transpose on device, whose value array is allocated but not initialized 226 */ 227 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 228 { 229 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 230 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 231 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 232 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 233 MatRowMapType *Ti = Ti_h.data(); 234 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 235 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 236 PetscInt *Tj = Tj_h.data(); 237 PetscInt *perm = perm_h.data(); 238 PetscInt *offset; 239 240 PetscFunctionBegin; 241 // Populate Ti 242 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 243 Ti++; 244 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 245 Ti--; 246 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 247 248 // Populate Tj and the permutation array 249 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 250 for (PetscInt i = 0; i < m; i++) { 251 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 252 PetscInt r = Aj[j]; // row r of T 253 PetscInt disp = Ti[r] + offset[r]; 254 255 Tj[disp] = i; // col i of T 256 perm[disp] = j; 257 offset[r]++; 258 } 259 } 260 PetscCall(PetscFree(offset)); 261 262 // Sort each row of T, along with the permutation array 263 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 264 265 // Output perm and T on device 266 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 267 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 268 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 269 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 270 PetscFunctionReturn(PETSC_SUCCESS); 271 } 272 273 // Generate the transpose on device and cache it internally 274 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 275 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 276 { 277 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 278 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 279 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 280 KokkosCsrMatrix &T = akok->csrmatT; 281 282 PetscFunctionBegin; 283 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 284 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 285 286 const auto &Aa = akok->a_dual.view_device(); 287 288 if (A->symmetric == PETSC_BOOL3_TRUE) { 289 *csrmatT = akok->csrmat; 290 } else { 291 // See if we already have a cached transpose and its value is up to date 292 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 293 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 294 const auto &perm = akok->transpose_perm; // get the permutation array 295 auto &Ta = T.values; 296 297 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 298 } 299 } else { // Generate T of size n x m for the first time 300 MatRowMapKokkosView perm; 301 302 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 303 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 304 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 305 } 306 akok->transpose_updated = PETSC_TRUE; 307 *csrmatT = akok->csrmatT; 308 } 309 PetscFunctionReturn(PETSC_SUCCESS); 310 } 311 312 // Generate the Hermitian on device and cache it internally 313 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 314 { 315 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 316 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 317 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 318 KokkosCsrMatrix &T = akok->csrmatH; 319 320 PetscFunctionBegin; 321 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 322 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 323 324 const auto &Aa = akok->a_dual.view_device(); 325 326 if (A->hermitian == PETSC_BOOL3_TRUE) { 327 *csrmatH = akok->csrmat; 328 } else { 329 // See if we already have a cached hermitian and its value is up to date 330 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 331 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 332 const auto &perm = akok->transpose_perm; // get the permutation array 333 auto &Ta = T.values; 334 335 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 336 } 337 } else { // Generate T of size n x m for the first time 338 MatRowMapKokkosView perm; 339 340 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 341 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 342 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 343 } 344 akok->hermitian_updated = PETSC_TRUE; 345 *csrmatH = akok->csrmatH; 346 } 347 PetscFunctionReturn(PETSC_SUCCESS); 348 } 349 350 /* y = A x */ 351 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 352 { 353 Mat_SeqAIJKokkos *aijkok; 354 ConstPetscScalarKokkosView xv; 355 PetscScalarKokkosView yv; 356 357 PetscFunctionBegin; 358 PetscCall(PetscLogGpuTimeBegin()); 359 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 360 PetscCall(VecGetKokkosView(xx, &xv)); 361 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 362 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 363 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 364 PetscCall(VecRestoreKokkosView(xx, &xv)); 365 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 366 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 367 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 368 PetscCall(PetscLogGpuTimeEnd()); 369 PetscFunctionReturn(PETSC_SUCCESS); 370 } 371 372 /* y = A^T x */ 373 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 374 { 375 Mat_SeqAIJKokkos *aijkok; 376 const char *mode; 377 ConstPetscScalarKokkosView xv; 378 PetscScalarKokkosView yv; 379 KokkosCsrMatrix csrmat; 380 381 PetscFunctionBegin; 382 PetscCall(PetscLogGpuTimeBegin()); 383 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 384 PetscCall(VecGetKokkosView(xx, &xv)); 385 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 386 if (A->form_explicit_transpose) { 387 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 388 mode = "N"; 389 } else { 390 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 391 csrmat = aijkok->csrmat; 392 mode = "T"; 393 } 394 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 395 PetscCall(VecRestoreKokkosView(xx, &xv)); 396 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 397 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 398 PetscCall(PetscLogGpuTimeEnd()); 399 PetscFunctionReturn(PETSC_SUCCESS); 400 } 401 402 /* y = A^H x */ 403 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 404 { 405 Mat_SeqAIJKokkos *aijkok; 406 const char *mode; 407 ConstPetscScalarKokkosView xv; 408 PetscScalarKokkosView yv; 409 KokkosCsrMatrix csrmat; 410 411 PetscFunctionBegin; 412 PetscCall(PetscLogGpuTimeBegin()); 413 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 414 PetscCall(VecGetKokkosView(xx, &xv)); 415 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 416 if (A->form_explicit_transpose) { 417 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 418 mode = "N"; 419 } else { 420 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 421 csrmat = aijkok->csrmat; 422 mode = "C"; 423 } 424 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 425 PetscCall(VecRestoreKokkosView(xx, &xv)); 426 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 427 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 428 PetscCall(PetscLogGpuTimeEnd()); 429 PetscFunctionReturn(PETSC_SUCCESS); 430 } 431 432 /* z = A x + y */ 433 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 434 { 435 Mat_SeqAIJKokkos *aijkok; 436 ConstPetscScalarKokkosView xv; 437 PetscScalarKokkosView zv; 438 439 PetscFunctionBegin; 440 PetscCall(PetscLogGpuTimeBegin()); 441 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 442 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 443 PetscCall(VecGetKokkosView(xx, &xv)); 444 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 445 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 446 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 447 PetscCall(VecRestoreKokkosView(xx, &xv)); 448 PetscCall(VecRestoreKokkosView(zz, &zv)); 449 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 450 PetscCall(PetscLogGpuTimeEnd()); 451 PetscFunctionReturn(PETSC_SUCCESS); 452 } 453 454 /* z = A^T x + y */ 455 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 456 { 457 Mat_SeqAIJKokkos *aijkok; 458 const char *mode; 459 ConstPetscScalarKokkosView xv; 460 PetscScalarKokkosView zv; 461 KokkosCsrMatrix csrmat; 462 463 PetscFunctionBegin; 464 PetscCall(PetscLogGpuTimeBegin()); 465 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 466 if (zz != yy) PetscCall(VecCopy(yy, zz)); 467 PetscCall(VecGetKokkosView(xx, &xv)); 468 PetscCall(VecGetKokkosView(zz, &zv)); 469 if (A->form_explicit_transpose) { 470 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 471 mode = "N"; 472 } else { 473 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 474 csrmat = aijkok->csrmat; 475 mode = "T"; 476 } 477 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 478 PetscCall(VecRestoreKokkosView(xx, &xv)); 479 PetscCall(VecRestoreKokkosView(zz, &zv)); 480 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 481 PetscCall(PetscLogGpuTimeEnd()); 482 PetscFunctionReturn(PETSC_SUCCESS); 483 } 484 485 /* z = A^H x + y */ 486 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 487 { 488 Mat_SeqAIJKokkos *aijkok; 489 const char *mode; 490 ConstPetscScalarKokkosView xv; 491 PetscScalarKokkosView zv; 492 KokkosCsrMatrix csrmat; 493 494 PetscFunctionBegin; 495 PetscCall(PetscLogGpuTimeBegin()); 496 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 497 if (zz != yy) PetscCall(VecCopy(yy, zz)); 498 PetscCall(VecGetKokkosView(xx, &xv)); 499 PetscCall(VecGetKokkosView(zz, &zv)); 500 if (A->form_explicit_transpose) { 501 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 502 mode = "N"; 503 } else { 504 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 505 csrmat = aijkok->csrmat; 506 mode = "C"; 507 } 508 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 509 PetscCall(VecRestoreKokkosView(xx, &xv)); 510 PetscCall(VecRestoreKokkosView(zz, &zv)); 511 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 512 PetscCall(PetscLogGpuTimeEnd()); 513 PetscFunctionReturn(PETSC_SUCCESS); 514 } 515 516 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 517 { 518 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 519 520 PetscFunctionBegin; 521 switch (op) { 522 case MAT_FORM_EXPLICIT_TRANSPOSE: 523 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 524 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 525 A->form_explicit_transpose = flg; 526 break; 527 default: 528 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 529 break; 530 } 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 /* Depending on reuse, either build a new mat, or use the existing mat */ 535 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 536 { 537 Mat_SeqAIJ *aseq; 538 539 PetscFunctionBegin; 540 PetscCall(PetscKokkosInitializeCheck()); 541 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 542 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 543 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 544 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 545 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 546 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 547 PetscCall(PetscFree(A->defaultvectype)); 548 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 549 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 550 PetscCall(MatSetOps_SeqAIJKokkos(A)); 551 aseq = static_cast<Mat_SeqAIJ *>(A->data); 552 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 553 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 554 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 555 } 556 } 557 PetscFunctionReturn(PETSC_SUCCESS); 558 } 559 560 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 561 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 562 */ 563 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 564 { 565 Mat_SeqAIJ *bseq; 566 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 567 Mat mat; 568 569 PetscFunctionBegin; 570 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 571 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 572 mat = *B; 573 if (A->assembled) { 574 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 575 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 576 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 577 /* Now copy values to B if needed */ 578 if (dupOption == MAT_COPY_VALUES) { 579 if (akok->a_dual.need_sync_device()) { 580 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 581 bkok->a_dual.modify_host(); 582 } else { /* If device has the latest data, we only copy data on device */ 583 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 584 bkok->a_dual.modify_device(); 585 } 586 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 587 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 588 bkok->a_dual.modify_host(); 589 } 590 mat->spptr = bkok; 591 } 592 593 PetscCall(PetscFree(mat->defaultvectype)); 594 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 595 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 596 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 597 PetscFunctionReturn(PETSC_SUCCESS); 598 } 599 600 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 601 { 602 Mat At; 603 KokkosCsrMatrix internT; 604 Mat_SeqAIJKokkos *atkok, *bkok; 605 606 PetscFunctionBegin; 607 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 608 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 609 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 610 /* Deep copy internT, as we want to isolate the internal transpose */ 611 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 612 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 613 if (reuse == MAT_INITIAL_MATRIX) *B = At; 614 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 615 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 616 if ((*B)->assembled) { 617 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 618 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 619 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 620 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 621 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 622 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 623 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 624 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 625 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 626 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 627 } 628 PetscFunctionReturn(PETSC_SUCCESS); 629 } 630 631 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 632 { 633 Mat_SeqAIJKokkos *aijkok; 634 635 PetscFunctionBegin; 636 if (A->factortype == MAT_FACTOR_NONE) { 637 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 638 delete aijkok; 639 } else { 640 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 641 } 642 A->spptr = NULL; 643 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 644 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 645 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 646 #if defined(PETSC_HAVE_HYPRE) 647 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 648 #endif 649 PetscCall(MatDestroy_SeqAIJ(A)); 650 PetscFunctionReturn(PETSC_SUCCESS); 651 } 652 653 /*MC 654 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 655 656 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 657 658 Options Database Key: 659 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 660 661 Level: beginner 662 663 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 664 M*/ 665 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 666 { 667 PetscFunctionBegin; 668 PetscCall(PetscKokkosInitializeCheck()); 669 PetscCall(MatCreate_SeqAIJ(A)); 670 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 671 PetscFunctionReturn(PETSC_SUCCESS); 672 } 673 674 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 675 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 676 { 677 Mat_SeqAIJ *a, *b; 678 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 679 MatScalarKokkosView aa, ba, ca; 680 MatRowMapKokkosView ai, bi, ci; 681 MatColIdxKokkosView aj, bj, cj; 682 PetscInt m, n, nnz, aN; 683 684 PetscFunctionBegin; 685 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 686 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 687 PetscAssertPointer(C, 4); 688 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 689 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 690 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 691 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 692 693 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 694 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 695 a = static_cast<Mat_SeqAIJ *>(A->data); 696 b = static_cast<Mat_SeqAIJ *>(B->data); 697 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 698 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 699 aa = akok->a_dual.view_device(); 700 ai = akok->i_dual.view_device(); 701 ba = bkok->a_dual.view_device(); 702 bi = bkok->i_dual.view_device(); 703 m = A->rmap->n; /* M, N and nnz of C */ 704 n = A->cmap->n + B->cmap->n; 705 nnz = a->nz + b->nz; 706 aN = A->cmap->n; /* N of A */ 707 if (reuse == MAT_INITIAL_MATRIX) { 708 aj = akok->j_dual.view_device(); 709 bj = bkok->j_dual.view_device(); 710 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 711 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 712 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 713 ca = ca_dual.view_device(); 714 ci = ci_dual.view_device(); 715 cj = cj_dual.view_device(); 716 717 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 718 Kokkos::parallel_for( 719 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 720 PetscInt i = t.league_rank(); /* row i */ 721 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 722 723 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 724 ci(i) = coffset; 725 if (i == m - 1) ci(m) = ai(m) + bi(m); 726 }); 727 728 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 729 if (k < alen) { 730 ca(coffset + k) = aa(ai(i) + k); 731 cj(coffset + k) = aj(ai(i) + k); 732 } else { 733 ca(coffset + k) = ba(bi(i) + k - alen); 734 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 735 } 736 }); 737 }); 738 ca_dual.modify_device(); 739 ci_dual.modify_device(); 740 cj_dual.modify_device(); 741 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 742 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 743 } else if (reuse == MAT_REUSE_MATRIX) { 744 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 745 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 746 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 747 ca = ckok->a_dual.view_device(); 748 ci = ckok->i_dual.view_device(); 749 750 Kokkos::parallel_for( 751 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752 PetscInt i = t.league_rank(); /* row i */ 753 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 755 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 756 else ca(ci(i) + k) = ba(bi(i) + k - alen); 757 }); 758 }); 759 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 760 } 761 PetscFunctionReturn(PETSC_SUCCESS); 762 } 763 764 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 765 { 766 PetscFunctionBegin; 767 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 768 PetscFunctionReturn(PETSC_SUCCESS); 769 } 770 771 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 772 { 773 Mat_Product *product = C->product; 774 Mat A, B; 775 bool transA, transB; /* use bool, since KK needs this type */ 776 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 777 Mat_SeqAIJ *c; 778 MatProductData_SeqAIJKokkos *pdata; 779 KokkosCsrMatrix csrmatA, csrmatB; 780 781 PetscFunctionBegin; 782 MatCheckProduct(C, 1); 783 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 784 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 785 786 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 787 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 788 // we still do numeric. 789 if (pdata->reusesym) { // numeric reuses results from symbolic 790 pdata->reusesym = PETSC_FALSE; 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 794 switch (product->type) { 795 case MATPRODUCT_AB: 796 transA = false; 797 transB = false; 798 break; 799 case MATPRODUCT_AtB: 800 transA = true; 801 transB = false; 802 break; 803 case MATPRODUCT_ABt: 804 transA = false; 805 transB = true; 806 break; 807 default: 808 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 809 } 810 811 A = product->A; 812 B = product->B; 813 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 814 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 815 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 816 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 817 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 818 819 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 820 821 csrmatA = akok->csrmat; 822 csrmatB = bkok->csrmat; 823 824 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 825 if (transA) { 826 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 827 transA = false; 828 } 829 830 if (transB) { 831 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 832 transB = false; 833 } 834 PetscCall(PetscLogGpuTimeBegin()); 835 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 836 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 837 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 838 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 839 #endif 840 841 PetscCall(PetscLogGpuTimeEnd()); 842 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 843 /* shorter version of MatAssemblyEnd_SeqAIJ */ 844 c = (Mat_SeqAIJ *)C->data; 845 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 846 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 847 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 848 c->reallocs = 0; 849 C->info.mallocs = 0; 850 C->info.nz_unneeded = 0; 851 C->assembled = C->was_assembled = PETSC_TRUE; 852 C->num_ass++; 853 PetscFunctionReturn(PETSC_SUCCESS); 854 } 855 856 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 857 { 858 Mat_Product *product = C->product; 859 MatProductType ptype; 860 Mat A, B; 861 bool transA, transB; 862 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 863 MatProductData_SeqAIJKokkos *pdata; 864 MPI_Comm comm; 865 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 866 867 PetscFunctionBegin; 868 MatCheckProduct(C, 1); 869 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 870 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 871 A = product->A; 872 B = product->B; 873 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 874 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 875 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 876 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 877 csrmatA = akok->csrmat; 878 csrmatB = bkok->csrmat; 879 880 ptype = product->type; 881 // Take advantage of the symmetry if true 882 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 883 ptype = MATPRODUCT_AB; 884 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 885 } 886 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 887 ptype = MATPRODUCT_AB; 888 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 889 } 890 891 switch (ptype) { 892 case MATPRODUCT_AB: 893 transA = false; 894 transB = false; 895 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 896 break; 897 case MATPRODUCT_AtB: 898 transA = true; 899 transB = false; 900 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 901 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 902 break; 903 case MATPRODUCT_ABt: 904 transA = false; 905 transB = true; 906 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 907 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 908 break; 909 default: 910 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 911 } 912 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 913 pdata->reusesym = product->api_user; 914 915 /* TODO: add command line options to select spgemm algorithms */ 916 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 917 918 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 919 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 920 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 921 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 922 #endif 923 #endif 924 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 925 926 PetscCall(PetscLogGpuTimeBegin()); 927 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 928 if (transA) { 929 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 930 transA = false; 931 } 932 933 if (transB) { 934 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 935 transB = false; 936 } 937 938 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 939 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 940 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 941 calling new Mat_SeqAIJKokkos(). 942 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 943 */ 944 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 945 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 946 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 947 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 948 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 949 #endif 950 PetscCall(PetscLogGpuTimeEnd()); 951 952 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 953 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 954 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 955 PetscFunctionReturn(PETSC_SUCCESS); 956 } 957 958 /* handles sparse matrix matrix ops */ 959 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 960 { 961 Mat_Product *product = mat->product; 962 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 963 964 PetscFunctionBegin; 965 MatCheckProduct(mat, 1); 966 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 967 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 968 if (Biskok && Ciskok) { 969 switch (product->type) { 970 case MATPRODUCT_AB: 971 case MATPRODUCT_AtB: 972 case MATPRODUCT_ABt: 973 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 974 break; 975 case MATPRODUCT_PtAP: 976 case MATPRODUCT_RARt: 977 case MATPRODUCT_ABC: 978 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 979 break; 980 default: 981 break; 982 } 983 } else { /* fallback for AIJ */ 984 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 985 } 986 PetscFunctionReturn(PETSC_SUCCESS); 987 } 988 989 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 990 { 991 Mat_SeqAIJKokkos *aijkok; 992 993 PetscFunctionBegin; 994 PetscCall(PetscLogGpuTimeBegin()); 995 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 996 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 997 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 998 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 999 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1000 PetscCall(PetscLogGpuTimeEnd()); 1001 PetscFunctionReturn(PETSC_SUCCESS); 1002 } 1003 1004 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1005 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1006 { 1007 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1008 1009 PetscFunctionBegin; 1010 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1011 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1012 1013 PetscCall(PetscLogGpuTimeBegin()); 1014 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1015 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1016 const auto &Aa = aijkok->a_dual.view_device(); 1017 const auto &Adiag = aijkok->diag_dual.view_device(); 1018 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1019 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1020 PetscCall(PetscLogGpuFlops(n)); 1021 PetscCall(PetscLogGpuTimeEnd()); 1022 } else { // need reassembly, very slow! 1023 PetscCall(MatShift_Basic(A, a)); 1024 } 1025 PetscFunctionReturn(PETSC_SUCCESS); 1026 } 1027 1028 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1029 { 1030 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1031 1032 PetscFunctionBegin; 1033 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1034 ConstPetscScalarKokkosView dv; 1035 PetscInt n, nv; 1036 1037 PetscCall(PetscLogGpuTimeBegin()); 1038 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1039 PetscCall(VecGetKokkosView(D, &dv)); 1040 PetscCall(VecGetLocalSize(D, &nv)); 1041 n = PetscMin(Y->rmap->n, Y->cmap->n); 1042 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1043 1044 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1045 const auto &Aa = aijkok->a_dual.view_device(); 1046 const auto &Adiag = aijkok->diag_dual.view_device(); 1047 PetscCallCXX(Kokkos::parallel_for( 1048 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1049 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1050 else Aa(Adiag(i)) += dv(i); 1051 })); 1052 PetscCall(VecRestoreKokkosView(D, &dv)); 1053 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1054 PetscCall(PetscLogGpuFlops(n)); 1055 PetscCall(PetscLogGpuTimeEnd()); 1056 } else { // need reassembly, very slow! 1057 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1058 } 1059 PetscFunctionReturn(PETSC_SUCCESS); 1060 } 1061 1062 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1063 { 1064 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1065 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1066 ConstPetscScalarKokkosView lv, rv; 1067 1068 PetscFunctionBegin; 1069 PetscCall(PetscLogGpuTimeBegin()); 1070 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1071 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1072 const auto &Aa = aijkok->a_dual.view_device(); 1073 const auto &Ai = aijkok->i_dual.view_device(); 1074 const auto &Aj = aijkok->j_dual.view_device(); 1075 if (ll) { 1076 PetscCall(VecGetLocalSize(ll, &m)); 1077 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1078 PetscCall(VecGetKokkosView(ll, &lv)); 1079 PetscCallCXX(Kokkos::parallel_for( // for each row 1080 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1081 PetscInt i = t.league_rank(); // row i 1082 PetscInt len = Ai(i + 1) - Ai(i); 1083 // scale entries on the row 1084 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1085 })); 1086 PetscCall(VecRestoreKokkosView(ll, &lv)); 1087 PetscCall(PetscLogGpuFlops(nz)); 1088 } 1089 if (rr) { 1090 PetscCall(VecGetLocalSize(rr, &n)); 1091 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1092 PetscCall(VecGetKokkosView(rr, &rv)); 1093 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1094 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1095 PetscCall(VecRestoreKokkosView(rr, &lv)); 1096 PetscCall(PetscLogGpuFlops(nz)); 1097 } 1098 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1099 PetscCall(PetscLogGpuTimeEnd()); 1100 PetscFunctionReturn(PETSC_SUCCESS); 1101 } 1102 1103 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1104 { 1105 Mat_SeqAIJKokkos *aijkok; 1106 1107 PetscFunctionBegin; 1108 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1109 if (aijkok) { /* Only zero the device if data is already there */ 1110 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1111 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1112 } else { /* Might be preallocated but not assembled */ 1113 PetscCall(MatZeroEntries_SeqAIJ(A)); 1114 } 1115 PetscFunctionReturn(PETSC_SUCCESS); 1116 } 1117 1118 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1119 { 1120 Mat_SeqAIJKokkos *aijkok; 1121 PetscInt n; 1122 PetscScalarKokkosView xv; 1123 1124 PetscFunctionBegin; 1125 PetscCall(VecGetLocalSize(x, &n)); 1126 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1127 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1128 1129 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1130 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1131 1132 const auto &Aa = aijkok->a_dual.view_device(); 1133 const auto &Ai = aijkok->i_dual.view_device(); 1134 const auto &Adiag = aijkok->diag_dual.view_device(); 1135 1136 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1137 Kokkos::parallel_for( 1138 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1139 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1140 else xv(i) = 0; 1141 }); 1142 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1143 PetscFunctionReturn(PETSC_SUCCESS); 1144 } 1145 1146 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1147 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1148 { 1149 Mat_SeqAIJKokkos *aijkok; 1150 1151 PetscFunctionBegin; 1152 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1153 PetscAssertPointer(kv, 2); 1154 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1155 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1156 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1157 *kv = aijkok->a_dual.view_device(); 1158 PetscFunctionReturn(PETSC_SUCCESS); 1159 } 1160 1161 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1162 { 1163 PetscFunctionBegin; 1164 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1165 PetscAssertPointer(kv, 2); 1166 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1167 PetscFunctionReturn(PETSC_SUCCESS); 1168 } 1169 1170 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1171 { 1172 Mat_SeqAIJKokkos *aijkok; 1173 1174 PetscFunctionBegin; 1175 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1176 PetscAssertPointer(kv, 2); 1177 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1178 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1179 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1180 *kv = aijkok->a_dual.view_device(); 1181 PetscFunctionReturn(PETSC_SUCCESS); 1182 } 1183 1184 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1185 { 1186 PetscFunctionBegin; 1187 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1188 PetscAssertPointer(kv, 2); 1189 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1190 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1191 PetscFunctionReturn(PETSC_SUCCESS); 1192 } 1193 1194 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1195 { 1196 Mat_SeqAIJKokkos *aijkok; 1197 1198 PetscFunctionBegin; 1199 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1200 PetscAssertPointer(kv, 2); 1201 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1203 *kv = aijkok->a_dual.view_device(); 1204 PetscFunctionReturn(PETSC_SUCCESS); 1205 } 1206 1207 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1208 { 1209 PetscFunctionBegin; 1210 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1211 PetscAssertPointer(kv, 2); 1212 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1213 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1214 PetscFunctionReturn(PETSC_SUCCESS); 1215 } 1216 1217 /* Computes Y += alpha X */ 1218 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1219 { 1220 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1221 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1222 ConstMatScalarKokkosView Xa; 1223 MatScalarKokkosView Ya; 1224 auto exec = PetscGetKokkosExecutionSpace(); 1225 1226 PetscFunctionBegin; 1227 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1228 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1229 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1230 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1231 PetscCall(PetscLogGpuTimeBegin()); 1232 1233 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1234 PetscBool e; 1235 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1236 if (e) { 1237 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1238 if (e) pattern = SAME_NONZERO_PATTERN; 1239 } 1240 } 1241 1242 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1243 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1244 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1245 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1246 */ 1247 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1248 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1249 Xa = xkok->a_dual.view_device(); 1250 Ya = ykok->a_dual.view_device(); 1251 1252 if (pattern == SAME_NONZERO_PATTERN) { 1253 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1254 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1255 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1256 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1257 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1258 1259 Kokkos::parallel_for( 1260 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1261 PetscInt i = t.league_rank(); // row i 1262 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1263 // Only one thread works in a team 1264 PetscInt p, q = Yi(i); 1265 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1266 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1267 if (Xj(p) == Yj(q)) { // Found it 1268 Ya(q) += alpha * Xa(p); 1269 q++; 1270 } else { 1271 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1272 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1273 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1274 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1275 #else 1276 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1277 #endif 1278 } 1279 } 1280 }); 1281 }); 1282 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1283 } else { // different nonzero patterns 1284 Mat Z; 1285 KokkosCsrMatrix zcsr; 1286 KernelHandle kh; 1287 kh.create_spadd_handle(true); // X, Y are sorted 1288 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1289 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1290 zkok = new Mat_SeqAIJKokkos(zcsr); 1291 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1292 PetscCall(MatHeaderReplace(Y, &Z)); 1293 kh.destroy_spadd_handle(); 1294 } 1295 PetscCall(PetscLogGpuTimeEnd()); 1296 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1297 PetscFunctionReturn(PETSC_SUCCESS); 1298 } 1299 1300 struct MatCOOStruct_SeqAIJKokkos { 1301 PetscCount n; 1302 PetscCount Atot; 1303 PetscInt nz; 1304 PetscCountKokkosView jmap; 1305 PetscCountKokkosView perm; 1306 1307 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1308 { 1309 nz = coo_h->nz; 1310 n = coo_h->n; 1311 Atot = coo_h->Atot; 1312 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1313 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1314 } 1315 }; 1316 1317 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1318 { 1319 PetscFunctionBegin; 1320 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1321 PetscFunctionReturn(PETSC_SUCCESS); 1322 } 1323 1324 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1325 { 1326 Mat_SeqAIJKokkos *akok; 1327 Mat_SeqAIJ *aseq; 1328 PetscContainer container_h; 1329 MatCOOStruct_SeqAIJ *coo_h; 1330 MatCOOStruct_SeqAIJKokkos *coo_d; 1331 1332 PetscFunctionBegin; 1333 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1334 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1335 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1336 delete akok; 1337 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1338 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1339 1340 // Copy the COO struct to device 1341 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1342 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1343 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1344 1345 // Put the COO struct in a container and then attach that to the matrix 1346 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1347 PetscFunctionReturn(PETSC_SUCCESS); 1348 } 1349 1350 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1351 { 1352 MatScalarKokkosView Aa; 1353 ConstMatScalarKokkosView kv; 1354 PetscMemType memtype; 1355 PetscContainer container; 1356 MatCOOStruct_SeqAIJKokkos *coo; 1357 1358 PetscFunctionBegin; 1359 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1360 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1361 1362 const auto &n = coo->n; 1363 const auto &Annz = coo->nz; 1364 const auto &jmap = coo->jmap; 1365 const auto &perm = coo->perm; 1366 1367 PetscCall(PetscGetMemType(v, &memtype)); 1368 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1369 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1370 } else { 1371 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1372 } 1373 1374 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1375 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1376 1377 PetscCall(PetscLogGpuTimeBegin()); 1378 Kokkos::parallel_for( 1379 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1380 PetscScalar sum = 0.0; 1381 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1382 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1383 }); 1384 PetscCall(PetscLogGpuTimeEnd()); 1385 1386 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1387 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1388 PetscFunctionReturn(PETSC_SUCCESS); 1389 } 1390 1391 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1392 { 1393 PetscFunctionBegin; 1394 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1395 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1396 B->offloadmask = PETSC_OFFLOAD_CPU; 1397 PetscFunctionReturn(PETSC_SUCCESS); 1398 } 1399 1400 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1401 { 1402 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1403 1404 PetscFunctionBegin; 1405 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1406 A->boundtocpu = PETSC_FALSE; 1407 1408 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1409 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1410 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1411 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1412 A->ops->scale = MatScale_SeqAIJKokkos; 1413 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1414 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1415 A->ops->mult = MatMult_SeqAIJKokkos; 1416 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1417 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1418 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1419 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1420 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1421 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1422 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1423 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1424 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1425 A->ops->shift = MatShift_SeqAIJKokkos; 1426 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1427 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1428 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1429 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1430 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1431 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1432 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1433 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1434 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1435 1436 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1437 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1438 #if defined(PETSC_HAVE_HYPRE) 1439 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1440 #endif 1441 PetscFunctionReturn(PETSC_SUCCESS); 1442 } 1443 1444 /* 1445 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1446 1447 Input Parameters: 1448 + A - the MATSEQAIJKOKKOS matrix 1449 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1450 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1451 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1452 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1453 1454 Output Parameter: 1455 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1456 */ 1457 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1458 { 1459 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1460 PetscInt nblocks = bs.extent(0) - 1; 1461 1462 PetscFunctionBegin; 1463 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1464 1465 // Pull out the diagonal blocks of the matrix and then invert the blocks 1466 auto Aa = akok->a_dual.view_device(); 1467 auto Ai = akok->i_dual.view_device(); 1468 auto Aj = akok->j_dual.view_device(); 1469 auto Adiag = akok->diag_dual.view_device(); 1470 // TODO: how to tune the team size? 1471 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1472 auto ts = Kokkos::AUTO(); 1473 #else 1474 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1475 #endif 1476 PetscCallCXX(Kokkos::parallel_for( 1477 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1478 const PetscInt bid = teamMember.league_rank(); // block id 1479 const PetscInt rstart = bs(bid); // this block starts from this row 1480 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1481 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1482 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1483 1484 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1485 PetscInt i = rstart + r; // i-th row in A 1486 1487 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1488 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1489 1490 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1491 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1492 B(r, c) = 0.0; 1493 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1494 B(r, c) = Aa(first + c); 1495 } else { // this entry does not show up in the CSR 1496 B(r, c) = 0.0; 1497 } 1498 } 1499 } else { // rare case that the diagonal does not exist 1500 const PetscInt begin = Ai(i); 1501 const PetscInt end = Ai(i + 1); 1502 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1503 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1504 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1505 else if (Aj(j) >= rstart + m) break; 1506 } 1507 } 1508 }); 1509 1510 // LU-decompose B (w/o pivoting) and then invert B 1511 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1512 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1513 })); 1514 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1515 PetscFunctionReturn(PETSC_SUCCESS); 1516 } 1517 1518 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1519 { 1520 Mat_SeqAIJ *aseq; 1521 PetscInt i, m, n; 1522 auto exec = PetscGetKokkosExecutionSpace(); 1523 1524 PetscFunctionBegin; 1525 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1526 1527 m = akok->nrows(); 1528 n = akok->ncols(); 1529 PetscCall(MatSetSizes(A, m, n, m, n)); 1530 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1531 1532 /* Set up data structures of A as a MATSEQAIJ */ 1533 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1534 aseq = (Mat_SeqAIJ *)A->data; 1535 1536 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1537 PetscCallCXX(akok->j_dual.sync_host(exec)); 1538 PetscCallCXX(exec.fence()); 1539 1540 aseq->i = akok->i_host_data(); 1541 aseq->j = akok->j_host_data(); 1542 aseq->a = akok->a_host_data(); 1543 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1544 aseq->free_a = PETSC_FALSE; 1545 aseq->free_ij = PETSC_FALSE; 1546 aseq->nz = akok->nnz(); 1547 aseq->maxnz = aseq->nz; 1548 1549 PetscCall(PetscMalloc1(m, &aseq->imax)); 1550 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1551 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1552 1553 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1554 akok->nonzerostate = A->nonzerostate; 1555 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1556 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1557 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1558 PetscFunctionReturn(PETSC_SUCCESS); 1559 } 1560 1561 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1562 { 1563 PetscFunctionBegin; 1564 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1565 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1566 PetscFunctionReturn(PETSC_SUCCESS); 1567 } 1568 1569 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1570 { 1571 Mat_SeqAIJKokkos *akok; 1572 1573 PetscFunctionBegin; 1574 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1575 PetscCall(MatCreate(comm, A)); 1576 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1577 PetscFunctionReturn(PETSC_SUCCESS); 1578 } 1579 1580 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1581 1582 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1583 */ 1584 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1585 { 1586 PetscFunctionBegin; 1587 PetscCall(MatCreate(comm, A)); 1588 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1589 PetscFunctionReturn(PETSC_SUCCESS); 1590 } 1591 1592 /*@C 1593 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1594 (the default parallel PETSc format). This matrix will ultimately be handled by 1595 Kokkos for calculations. 1596 1597 Collective 1598 1599 Input Parameters: 1600 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1601 . m - number of rows 1602 . n - number of columns 1603 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1604 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1605 1606 Output Parameter: 1607 . A - the matrix 1608 1609 Level: intermediate 1610 1611 Notes: 1612 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1613 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1614 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1615 1616 The AIJ format, also called 1617 compressed row storage, is fully compatible with standard Fortran 1618 storage. That is, the stored row and column indices can begin at 1619 either one (as in Fortran) or zero. 1620 1621 Specify the preallocated storage with either `nz` or `nnz` (not both). 1622 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1623 allocation. 1624 1625 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1626 @*/ 1627 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1628 { 1629 PetscFunctionBegin; 1630 PetscCall(PetscKokkosInitializeCheck()); 1631 PetscCall(MatCreate(comm, A)); 1632 PetscCall(MatSetSizes(*A, m, n, m, n)); 1633 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1634 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637 1638 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1639 { 1640 PetscFunctionBegin; 1641 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1642 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1647 { 1648 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1649 1650 PetscFunctionBegin; 1651 if (!factors->sptrsv_symbolic_completed) { 1652 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1653 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1654 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1655 } 1656 PetscFunctionReturn(PETSC_SUCCESS); 1657 } 1658 1659 /* Check if we need to update factors etc for transpose solve */ 1660 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1661 { 1662 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1663 MatColIdxType n = A->rmap->n; 1664 1665 PetscFunctionBegin; 1666 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1667 /* Update L^T and do sptrsv symbolic */ 1668 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1669 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1670 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1671 1672 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1673 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1674 1675 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1676 We have to sort the indices, until KK provides finer control options. 1677 */ 1678 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1679 1680 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1681 1682 /* Update U^T and do sptrsv symbolic */ 1683 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1684 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1685 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1686 1687 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1688 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1689 1690 /* Sort indices. See comments above */ 1691 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1692 1693 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1694 factors->transpose_updated = PETSC_TRUE; 1695 } 1696 PetscFunctionReturn(PETSC_SUCCESS); 1697 } 1698 1699 /* Solve Ax = b, with A = LU */ 1700 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1701 { 1702 ConstPetscScalarKokkosView bv; 1703 PetscScalarKokkosView xv; 1704 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1705 1706 PetscFunctionBegin; 1707 PetscCall(PetscLogGpuTimeBegin()); 1708 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1709 PetscCall(VecGetKokkosView(b, &bv)); 1710 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1711 /* Solve L tmpv = b */ 1712 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1713 /* Solve Ux = tmpv */ 1714 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1715 PetscCall(VecRestoreKokkosView(b, &bv)); 1716 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1717 PetscCall(PetscLogGpuTimeEnd()); 1718 PetscFunctionReturn(PETSC_SUCCESS); 1719 } 1720 1721 /* Solve A^T x = b, where A^T = U^T L^T */ 1722 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1723 { 1724 ConstPetscScalarKokkosView bv; 1725 PetscScalarKokkosView xv; 1726 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1727 1728 PetscFunctionBegin; 1729 PetscCall(PetscLogGpuTimeBegin()); 1730 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1731 PetscCall(VecGetKokkosView(b, &bv)); 1732 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1733 /* Solve U^T tmpv = b */ 1734 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1735 1736 /* Solve L^T x = tmpv */ 1737 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1738 PetscCall(VecRestoreKokkosView(b, &bv)); 1739 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1740 PetscCall(PetscLogGpuTimeEnd()); 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1745 { 1746 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1747 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1748 PetscInt fill_lev = info->levels; 1749 1750 PetscFunctionBegin; 1751 PetscCall(PetscLogGpuTimeBegin()); 1752 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1753 1754 auto a_d = aijkok->a_dual.view_device(); 1755 auto i_d = aijkok->i_dual.view_device(); 1756 auto j_d = aijkok->j_dual.view_device(); 1757 1758 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1759 1760 B->assembled = PETSC_TRUE; 1761 B->preallocated = PETSC_TRUE; 1762 B->ops->solve = MatSolve_SeqAIJKokkos; 1763 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1764 B->ops->matsolve = NULL; 1765 B->ops->matsolvetranspose = NULL; 1766 B->offloadmask = PETSC_OFFLOAD_GPU; 1767 1768 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1769 factors->transpose_updated = PETSC_FALSE; 1770 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1771 /* TODO: log flops, but how to know that? */ 1772 PetscCall(PetscLogGpuTimeEnd()); 1773 PetscFunctionReturn(PETSC_SUCCESS); 1774 } 1775 1776 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1777 { 1778 Mat_SeqAIJKokkos *aijkok; 1779 Mat_SeqAIJ *b; 1780 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1781 PetscInt fill_lev = info->levels; 1782 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1783 PetscInt n = A->rmap->n; 1784 1785 PetscFunctionBegin; 1786 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1787 /* Rebuild factors */ 1788 if (factors) { 1789 factors->Destroy(); 1790 } /* Destroy the old if it exists */ 1791 else { 1792 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1793 } 1794 1795 /* Create a spiluk handle and then do symbolic factorization */ 1796 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1797 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1798 1799 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1800 1801 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1802 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1803 Kokkos::realloc(factors->iU_d, n + 1); 1804 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1805 1806 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1807 auto i_d = aijkok->i_dual.view_device(); 1808 auto j_d = aijkok->j_dual.view_device(); 1809 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1810 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1811 1812 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1813 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1814 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1815 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1816 1817 /* TODO: add options to select sptrsv algorithms */ 1818 /* Create sptrsv handles for L, U and their transpose */ 1819 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1820 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1821 #else 1822 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1823 #endif 1824 1825 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1826 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1827 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1828 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1829 1830 /* Fill fields of the factor matrix B */ 1831 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1832 b = (Mat_SeqAIJ *)B->data; 1833 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1834 B->info.fill_ratio_given = info->fill; 1835 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1836 1837 B->offloadmask = PETSC_OFFLOAD_GPU; 1838 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1839 PetscFunctionReturn(PETSC_SUCCESS); 1840 } 1841 1842 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1843 { 1844 PetscFunctionBegin; 1845 *type = MATSOLVERKOKKOS; 1846 PetscFunctionReturn(PETSC_SUCCESS); 1847 } 1848 1849 /*MC 1850 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1851 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1852 1853 Level: beginner 1854 1855 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1856 M*/ 1857 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1858 { 1859 PetscInt n = A->rmap->n; 1860 1861 PetscFunctionBegin; 1862 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1863 PetscCall(MatSetSizes(*B, n, n, n, n)); 1864 (*B)->factortype = ftype; 1865 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1866 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1867 1868 if (ftype == MAT_FACTOR_LU) { 1869 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1870 (*B)->canuseordering = PETSC_TRUE; 1871 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1872 } else if (ftype == MAT_FACTOR_ILU) { 1873 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1874 (*B)->canuseordering = PETSC_FALSE; 1875 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1876 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1877 1878 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1879 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1880 PetscFunctionReturn(PETSC_SUCCESS); 1881 } 1882 1883 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1884 { 1885 PetscFunctionBegin; 1886 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1887 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1888 PetscFunctionReturn(PETSC_SUCCESS); 1889 } 1890 1891 /* Utility to print out a KokkosCsrMatrix for debugging */ 1892 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1893 { 1894 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 1895 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 1896 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 1897 const PetscInt *i = iv.data(); 1898 const PetscInt *j = jv.data(); 1899 const PetscScalar *a = av.data(); 1900 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1901 1902 PetscFunctionBegin; 1903 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1904 for (PetscInt k = 0; k < m; k++) { 1905 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1906 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1907 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1908 } 1909 PetscFunctionReturn(PETSC_SUCCESS); 1910 } 1911