1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 PetscFunctionReturn(PETSC_SUCCESS); 58 } 59 60 /* Sync CSR data to device if not yet */ 61 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 62 { 63 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 64 65 PetscFunctionBegin; 66 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 67 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 68 if (aijkok->a_dual.need_sync_device()) { 69 aijkok->a_dual.sync_device(); 70 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 71 aijkok->hermitian_updated = PETSC_FALSE; 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 /* Mark the CSR data on device as modified */ 77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 78 { 79 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 80 81 PetscFunctionBegin; 82 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 83 aijkok->a_dual.clear_sync_state(); 84 aijkok->a_dual.modify_device(); 85 aijkok->transpose_updated = PETSC_FALSE; 86 aijkok->hermitian_updated = PETSC_FALSE; 87 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 88 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 93 { 94 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95 96 PetscFunctionBegin; 97 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 98 /* We do not expect one needs factors on host */ 99 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 100 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 101 aijkok->a_dual.sync_host(); 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 106 { 107 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 108 109 PetscFunctionBegin; 110 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 111 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 112 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 113 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 114 */ 115 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 116 aijkok->a_dual.sync_host(); 117 *array = aijkok->a_dual.view_host().data(); 118 } else { /* Happens when calling MatSetValues on a newly created matrix */ 119 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 120 } 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 130 PetscFunctionReturn(PETSC_SUCCESS); 131 } 132 133 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 134 { 135 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 136 137 PetscFunctionBegin; 138 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 139 aijkok->a_dual.sync_host(); 140 *array = aijkok->a_dual.view_host().data(); 141 } else { 142 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 143 } 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 148 { 149 PetscFunctionBegin; 150 *array = NULL; 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155 { 156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157 158 PetscFunctionBegin; 159 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 160 *array = aijkok->a_dual.view_host().data(); 161 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 162 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 163 } 164 PetscFunctionReturn(PETSC_SUCCESS); 165 } 166 167 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 168 { 169 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 170 171 PetscFunctionBegin; 172 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 173 aijkok->a_dual.clear_sync_state(); 174 aijkok->a_dual.modify_host(); 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 185 186 if (i) *i = aijkok->i_device_data(); 187 if (j) *j = aijkok->j_device_data(); 188 if (a) { 189 aijkok->a_dual.sync_device(); 190 *a = aijkok->a_device_data(); 191 } 192 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 /* 197 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 198 199 Input Parameter: 200 . A - the MATSEQAIJKOKKOS matrix 201 202 Output Parameters: 203 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 204 - T_d - the transpose on device, whose value array is allocated but not initialized 205 */ 206 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 207 { 208 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 209 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 210 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 211 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 212 MatRowMapType *Ti = Ti_h.data(); 213 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 214 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 215 PetscInt *Tj = Tj_h.data(); 216 PetscInt *perm = perm_h.data(); 217 PetscInt *offset; 218 219 PetscFunctionBegin; 220 // Populate Ti 221 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 222 Ti++; 223 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 224 Ti--; 225 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 226 227 // Populate Tj and the permutation array 228 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 229 for (PetscInt i = 0; i < m; i++) { 230 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 231 PetscInt r = Aj[j]; // row r of T 232 PetscInt disp = Ti[r] + offset[r]; 233 234 Tj[disp] = i; // col i of T 235 perm[disp] = j; 236 offset[r]++; 237 } 238 } 239 PetscCall(PetscFree(offset)); 240 241 // Sort each row of T, along with the permutation array 242 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 243 244 // Output perm and T on device 245 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 246 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 247 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 248 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 // Generate the transpose on device and cache it internally 253 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 254 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 258 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 259 KokkosCsrMatrix &T = akok->csrmatT; 260 261 PetscFunctionBegin; 262 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 263 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 264 265 const auto &Aa = akok->a_dual.view_device(); 266 267 if (A->symmetric == PETSC_BOOL3_TRUE) { 268 *csrmatT = akok->csrmat; 269 } else { 270 // See if we already have a cached transpose and its value is up to date 271 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 272 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 273 const auto &perm = akok->transpose_perm; // get the permutation array 274 auto &Ta = T.values; 275 276 PetscCallCXX(Kokkos::parallel_for( 277 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 278 } 279 } else { // Generate T of size n x m for the first time 280 MatRowMapKokkosView perm; 281 282 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 283 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 284 PetscCallCXX(Kokkos::parallel_for( 285 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 286 } 287 akok->transpose_updated = PETSC_TRUE; 288 *csrmatT = akok->csrmatT; 289 } 290 PetscFunctionReturn(PETSC_SUCCESS); 291 } 292 293 // Generate the Hermitian on device and cache it internally 294 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 295 { 296 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 297 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 298 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 299 KokkosCsrMatrix &T = akok->csrmatH; 300 301 PetscFunctionBegin; 302 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 303 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 304 305 const auto &Aa = akok->a_dual.view_device(); 306 307 if (A->hermitian == PETSC_BOOL3_TRUE) { 308 *csrmatH = akok->csrmat; 309 } else { 310 // See if we already have a cached hermitian and its value is up to date 311 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 312 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 313 const auto &perm = akok->transpose_perm; // get the permutation array 314 auto &Ta = T.values; 315 316 PetscCallCXX(Kokkos::parallel_for( 317 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 318 } 319 } else { // Generate T of size n x m for the first time 320 MatRowMapKokkosView perm; 321 322 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 323 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 324 PetscCallCXX(Kokkos::parallel_for( 325 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 326 } 327 akok->hermitian_updated = PETSC_TRUE; 328 *csrmatH = akok->csrmatH; 329 } 330 PetscFunctionReturn(PETSC_SUCCESS); 331 } 332 333 /* y = A x */ 334 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 335 { 336 Mat_SeqAIJKokkos *aijkok; 337 ConstPetscScalarKokkosView xv; 338 PetscScalarKokkosView yv; 339 340 PetscFunctionBegin; 341 PetscCall(PetscLogGpuTimeBegin()); 342 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 343 PetscCall(VecGetKokkosView(xx, &xv)); 344 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 345 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 346 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 347 PetscCall(VecRestoreKokkosView(xx, &xv)); 348 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 349 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 350 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 351 PetscCall(PetscLogGpuTimeEnd()); 352 PetscFunctionReturn(PETSC_SUCCESS); 353 } 354 355 /* y = A^T x */ 356 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJKokkos *aijkok; 359 const char *mode; 360 ConstPetscScalarKokkosView xv; 361 PetscScalarKokkosView yv; 362 KokkosCsrMatrix csrmat; 363 364 PetscFunctionBegin; 365 PetscCall(PetscLogGpuTimeBegin()); 366 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 367 PetscCall(VecGetKokkosView(xx, &xv)); 368 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 369 if (A->form_explicit_transpose) { 370 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 374 csrmat = aijkok->csrmat; 375 mode = "T"; 376 } 377 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 381 PetscCall(PetscLogGpuTimeEnd()); 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 /* y = A^H x */ 386 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387 { 388 Mat_SeqAIJKokkos *aijkok; 389 const char *mode; 390 ConstPetscScalarKokkosView xv; 391 PetscScalarKokkosView yv; 392 KokkosCsrMatrix csrmat; 393 394 PetscFunctionBegin; 395 PetscCall(PetscLogGpuTimeBegin()); 396 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 397 PetscCall(VecGetKokkosView(xx, &xv)); 398 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 399 if (A->form_explicit_transpose) { 400 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 401 mode = "N"; 402 } else { 403 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 404 csrmat = aijkok->csrmat; 405 mode = "C"; 406 } 407 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 408 PetscCall(VecRestoreKokkosView(xx, &xv)); 409 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 410 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 411 PetscCall(PetscLogGpuTimeEnd()); 412 PetscFunctionReturn(PETSC_SUCCESS); 413 } 414 415 /* z = A x + y */ 416 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 417 { 418 Mat_SeqAIJKokkos *aijkok; 419 ConstPetscScalarKokkosView xv, yv; 420 PetscScalarKokkosView zv; 421 422 PetscFunctionBegin; 423 PetscCall(PetscLogGpuTimeBegin()); 424 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 425 PetscCall(VecGetKokkosView(xx, &xv)); 426 PetscCall(VecGetKokkosView(yy, &yv)); 427 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 428 if (zz != yy) Kokkos::deep_copy(zv, yv); 429 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 430 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 431 PetscCall(VecRestoreKokkosView(xx, &xv)); 432 PetscCall(VecRestoreKokkosView(yy, &yv)); 433 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 434 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 435 PetscCall(PetscLogGpuTimeEnd()); 436 PetscFunctionReturn(PETSC_SUCCESS); 437 } 438 439 /* z = A^T x + y */ 440 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 441 { 442 Mat_SeqAIJKokkos *aijkok; 443 const char *mode; 444 ConstPetscScalarKokkosView xv, yv; 445 PetscScalarKokkosView zv; 446 KokkosCsrMatrix csrmat; 447 448 PetscFunctionBegin; 449 PetscCall(PetscLogGpuTimeBegin()); 450 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 451 PetscCall(VecGetKokkosView(xx, &xv)); 452 PetscCall(VecGetKokkosView(yy, &yv)); 453 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 454 if (zz != yy) Kokkos::deep_copy(zv, yv); 455 if (A->form_explicit_transpose) { 456 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 457 mode = "N"; 458 } else { 459 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 460 csrmat = aijkok->csrmat; 461 mode = "T"; 462 } 463 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 464 PetscCall(VecRestoreKokkosView(xx, &xv)); 465 PetscCall(VecRestoreKokkosView(yy, &yv)); 466 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 467 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 468 PetscCall(PetscLogGpuTimeEnd()); 469 PetscFunctionReturn(PETSC_SUCCESS); 470 } 471 472 /* z = A^H x + y */ 473 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 474 { 475 Mat_SeqAIJKokkos *aijkok; 476 const char *mode; 477 ConstPetscScalarKokkosView xv, yv; 478 PetscScalarKokkosView zv; 479 KokkosCsrMatrix csrmat; 480 481 PetscFunctionBegin; 482 PetscCall(PetscLogGpuTimeBegin()); 483 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 484 PetscCall(VecGetKokkosView(xx, &xv)); 485 PetscCall(VecGetKokkosView(yy, &yv)); 486 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 487 if (zz != yy) Kokkos::deep_copy(zv, yv); 488 if (A->form_explicit_transpose) { 489 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 490 mode = "N"; 491 } else { 492 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 493 csrmat = aijkok->csrmat; 494 mode = "C"; 495 } 496 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 497 PetscCall(VecRestoreKokkosView(xx, &xv)); 498 PetscCall(VecRestoreKokkosView(yy, &yv)); 499 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 500 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 501 PetscCall(PetscLogGpuTimeEnd()); 502 PetscFunctionReturn(PETSC_SUCCESS); 503 } 504 505 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 506 { 507 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508 509 PetscFunctionBegin; 510 switch (op) { 511 case MAT_FORM_EXPLICIT_TRANSPOSE: 512 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 513 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 514 A->form_explicit_transpose = flg; 515 break; 516 default: 517 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 518 break; 519 } 520 PetscFunctionReturn(PETSC_SUCCESS); 521 } 522 523 /* Depending on reuse, either build a new mat, or use the existing mat */ 524 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 525 { 526 Mat_SeqAIJ *aseq; 527 528 PetscFunctionBegin; 529 PetscCall(PetscKokkosInitializeCheck()); 530 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 531 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 532 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 533 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 534 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 535 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 536 PetscCall(PetscFree(A->defaultvectype)); 537 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 538 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 539 PetscCall(MatSetOps_SeqAIJKokkos(A)); 540 aseq = static_cast<Mat_SeqAIJ *>(A->data); 541 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 542 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 543 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 544 } 545 } 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 549 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 550 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 551 */ 552 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 553 { 554 Mat_SeqAIJ *bseq; 555 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 556 Mat mat; 557 558 PetscFunctionBegin; 559 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 560 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 561 mat = *B; 562 if (A->preallocated) { 563 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 564 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 565 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 566 /* Now copy values to B if needed */ 567 if (dupOption == MAT_COPY_VALUES) { 568 if (akok->a_dual.need_sync_device()) { 569 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 570 bkok->a_dual.modify_host(); 571 } else { /* If device has the latest data, we only copy data on device */ 572 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 573 bkok->a_dual.modify_device(); 574 } 575 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 576 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 577 bkok->a_dual.modify_host(); 578 } 579 mat->spptr = bkok; 580 } 581 582 PetscCall(PetscFree(mat->defaultvectype)); 583 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 584 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 585 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 586 PetscFunctionReturn(PETSC_SUCCESS); 587 } 588 589 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 590 { 591 Mat At; 592 KokkosCsrMatrix internT; 593 Mat_SeqAIJKokkos *atkok, *bkok; 594 595 PetscFunctionBegin; 596 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 597 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 598 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 599 /* Deep copy internT, as we want to isolate the internal transpose */ 600 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 601 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 602 if (reuse == MAT_INITIAL_MATRIX) *B = At; 603 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 604 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 605 if ((*B)->assembled) { 606 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 607 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 608 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 609 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 610 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 611 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 612 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 613 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 614 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 615 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 616 } 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 621 { 622 Mat_SeqAIJKokkos *aijkok; 623 624 PetscFunctionBegin; 625 if (A->factortype == MAT_FACTOR_NONE) { 626 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 627 delete aijkok; 628 } else { 629 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 630 } 631 A->spptr = NULL; 632 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 633 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 635 PetscCall(MatDestroy_SeqAIJ(A)); 636 PetscFunctionReturn(PETSC_SUCCESS); 637 } 638 639 /*MC 640 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 641 642 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 643 644 Options Database Key: 645 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 646 647 Level: beginner 648 649 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 650 M*/ 651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 652 { 653 PetscFunctionBegin; 654 PetscCall(PetscKokkosInitializeCheck()); 655 PetscCall(MatCreate_SeqAIJ(A)); 656 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 657 PetscFunctionReturn(PETSC_SUCCESS); 658 } 659 660 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 661 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 662 { 663 Mat_SeqAIJ *a, *b; 664 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 665 MatScalarKokkosView aa, ba, ca; 666 MatRowMapKokkosView ai, bi, ci; 667 MatColIdxKokkosView aj, bj, cj; 668 PetscInt m, n, nnz, aN; 669 670 PetscFunctionBegin; 671 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 672 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 673 PetscValidPointer(C, 4); 674 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 675 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 676 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 677 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 678 679 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 680 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 681 a = static_cast<Mat_SeqAIJ *>(A->data); 682 b = static_cast<Mat_SeqAIJ *>(B->data); 683 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 684 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 685 aa = akok->a_dual.view_device(); 686 ai = akok->i_dual.view_device(); 687 ba = bkok->a_dual.view_device(); 688 bi = bkok->i_dual.view_device(); 689 m = A->rmap->n; /* M, N and nnz of C */ 690 n = A->cmap->n + B->cmap->n; 691 nnz = a->nz + b->nz; 692 aN = A->cmap->n; /* N of A */ 693 if (reuse == MAT_INITIAL_MATRIX) { 694 aj = akok->j_dual.view_device(); 695 bj = bkok->j_dual.view_device(); 696 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 697 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 698 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 699 ca = ca_dual.view_device(); 700 ci = ci_dual.view_device(); 701 cj = cj_dual.view_device(); 702 703 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 704 Kokkos::parallel_for( 705 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 706 PetscInt i = t.league_rank(); /* row i */ 707 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 708 709 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 710 ci(i) = coffset; 711 if (i == m - 1) ci(m) = ai(m) + bi(m); 712 }); 713 714 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 715 if (k < alen) { 716 ca(coffset + k) = aa(ai(i) + k); 717 cj(coffset + k) = aj(ai(i) + k); 718 } else { 719 ca(coffset + k) = ba(bi(i) + k - alen); 720 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 721 } 722 }); 723 }); 724 ca_dual.modify_device(); 725 ci_dual.modify_device(); 726 cj_dual.modify_device(); 727 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 728 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 729 } else if (reuse == MAT_REUSE_MATRIX) { 730 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 731 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 732 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 733 ca = ckok->a_dual.view_device(); 734 ci = ckok->i_dual.view_device(); 735 736 Kokkos::parallel_for( 737 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 738 PetscInt i = t.league_rank(); /* row i */ 739 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 740 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 742 else ca(ci(i) + k) = ba(bi(i) + k - alen); 743 }); 744 }); 745 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 746 } 747 PetscFunctionReturn(PETSC_SUCCESS); 748 } 749 750 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 751 { 752 PetscFunctionBegin; 753 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 754 PetscFunctionReturn(PETSC_SUCCESS); 755 } 756 757 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 758 { 759 Mat_Product *product = C->product; 760 Mat A, B; 761 bool transA, transB; /* use bool, since KK needs this type */ 762 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 763 Mat_SeqAIJ *c; 764 MatProductData_SeqAIJKokkos *pdata; 765 KokkosCsrMatrix csrmatA, csrmatB; 766 767 PetscFunctionBegin; 768 MatCheckProduct(C, 1); 769 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 770 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 771 772 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 773 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 774 // we still do numeric. 775 if (pdata->reusesym) { // numeric reuses results from symbolic 776 pdata->reusesym = PETSC_FALSE; 777 PetscFunctionReturn(PETSC_SUCCESS); 778 } 779 780 switch (product->type) { 781 case MATPRODUCT_AB: 782 transA = false; 783 transB = false; 784 break; 785 case MATPRODUCT_AtB: 786 transA = true; 787 transB = false; 788 break; 789 case MATPRODUCT_ABt: 790 transA = false; 791 transB = true; 792 break; 793 default: 794 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 795 } 796 797 A = product->A; 798 B = product->B; 799 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 800 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 801 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 802 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 803 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 804 805 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 806 807 csrmatA = akok->csrmat; 808 csrmatB = bkok->csrmat; 809 810 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 811 if (transA) { 812 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 813 transA = false; 814 } 815 816 if (transB) { 817 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 818 transB = false; 819 } 820 PetscCall(PetscLogGpuTimeBegin()); 821 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 822 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 823 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 824 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 825 #endif 826 827 PetscCall(PetscLogGpuTimeEnd()); 828 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 829 /* shorter version of MatAssemblyEnd_SeqAIJ */ 830 c = (Mat_SeqAIJ *)C->data; 831 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 832 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 833 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 834 c->reallocs = 0; 835 C->info.mallocs = 0; 836 C->info.nz_unneeded = 0; 837 C->assembled = C->was_assembled = PETSC_TRUE; 838 C->num_ass++; 839 PetscFunctionReturn(PETSC_SUCCESS); 840 } 841 842 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 843 { 844 Mat_Product *product = C->product; 845 MatProductType ptype; 846 Mat A, B; 847 bool transA, transB; 848 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 849 MatProductData_SeqAIJKokkos *pdata; 850 MPI_Comm comm; 851 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 852 853 PetscFunctionBegin; 854 MatCheckProduct(C, 1); 855 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 856 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 857 A = product->A; 858 B = product->B; 859 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 860 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 861 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 863 csrmatA = akok->csrmat; 864 csrmatB = bkok->csrmat; 865 866 ptype = product->type; 867 // Take advantage of the symmetry if true 868 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 869 ptype = MATPRODUCT_AB; 870 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 871 } 872 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 873 ptype = MATPRODUCT_AB; 874 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 875 } 876 877 switch (ptype) { 878 case MATPRODUCT_AB: 879 transA = false; 880 transB = false; 881 break; 882 case MATPRODUCT_AtB: 883 transA = true; 884 transB = false; 885 break; 886 case MATPRODUCT_ABt: 887 transA = false; 888 transB = true; 889 break; 890 default: 891 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 892 } 893 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 894 pdata->reusesym = product->api_user; 895 896 /* TODO: add command line options to select spgemm algorithms */ 897 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 898 899 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 900 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 901 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 902 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 903 #endif 904 #endif 905 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 906 907 PetscCall(PetscLogGpuTimeBegin()); 908 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 909 if (transA) { 910 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 911 transA = false; 912 } 913 914 if (transB) { 915 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 916 transB = false; 917 } 918 919 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 920 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 921 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 922 calling new Mat_SeqAIJKokkos(). 923 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 924 */ 925 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 926 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 927 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 928 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 929 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 930 #endif 931 PetscCall(PetscLogGpuTimeEnd()); 932 933 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 934 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 935 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 936 PetscFunctionReturn(PETSC_SUCCESS); 937 } 938 939 /* handles sparse matrix matrix ops */ 940 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 941 { 942 Mat_Product *product = mat->product; 943 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 944 945 PetscFunctionBegin; 946 MatCheckProduct(mat, 1); 947 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 948 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 949 if (Biskok && Ciskok) { 950 switch (product->type) { 951 case MATPRODUCT_AB: 952 case MATPRODUCT_AtB: 953 case MATPRODUCT_ABt: 954 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 955 break; 956 case MATPRODUCT_PtAP: 957 case MATPRODUCT_RARt: 958 case MATPRODUCT_ABC: 959 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 960 break; 961 default: 962 break; 963 } 964 } else { /* fallback for AIJ */ 965 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 966 } 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 971 { 972 Mat_SeqAIJKokkos *aijkok; 973 974 PetscFunctionBegin; 975 PetscCall(PetscLogGpuTimeBegin()); 976 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 977 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 978 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 979 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 980 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 981 PetscCall(PetscLogGpuTimeEnd()); 982 PetscFunctionReturn(PETSC_SUCCESS); 983 } 984 985 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 986 { 987 Mat_SeqAIJKokkos *aijkok; 988 989 PetscFunctionBegin; 990 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 991 if (aijkok) { /* Only zero the device if data is already there */ 992 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 993 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 994 } else { /* Might be preallocated but not assembled */ 995 PetscCall(MatZeroEntries_SeqAIJ(A)); 996 } 997 PetscFunctionReturn(PETSC_SUCCESS); 998 } 999 1000 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1001 { 1002 Mat_SeqAIJ *aijseq; 1003 Mat_SeqAIJKokkos *aijkok; 1004 PetscInt n; 1005 PetscScalarKokkosView xv; 1006 1007 PetscFunctionBegin; 1008 PetscCall(VecGetLocalSize(x, &n)); 1009 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1010 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1011 1012 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014 1015 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1016 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1017 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1018 aijkok->SetDiagonal(aijseq->diag); 1019 } 1020 1021 const auto &Aa = aijkok->a_dual.view_device(); 1022 const auto &Ai = aijkok->i_dual.view_device(); 1023 const auto &Adiag = aijkok->diag_dual.view_device(); 1024 1025 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1026 Kokkos::parallel_for( 1027 n, KOKKOS_LAMBDA(const PetscInt i) { 1028 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1029 else xv(i) = 0; 1030 }); 1031 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1032 PetscFunctionReturn(PETSC_SUCCESS); 1033 } 1034 1035 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1036 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1037 { 1038 Mat_SeqAIJKokkos *aijkok; 1039 1040 PetscFunctionBegin; 1041 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1042 PetscValidPointer(kv, 2); 1043 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1044 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046 *kv = aijkok->a_dual.view_device(); 1047 PetscFunctionReturn(PETSC_SUCCESS); 1048 } 1049 1050 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1051 { 1052 PetscFunctionBegin; 1053 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1054 PetscValidPointer(kv, 2); 1055 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1056 PetscFunctionReturn(PETSC_SUCCESS); 1057 } 1058 1059 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1060 { 1061 Mat_SeqAIJKokkos *aijkok; 1062 1063 PetscFunctionBegin; 1064 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1065 PetscValidPointer(kv, 2); 1066 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1067 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1068 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1069 *kv = aijkok->a_dual.view_device(); 1070 PetscFunctionReturn(PETSC_SUCCESS); 1071 } 1072 1073 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1074 { 1075 PetscFunctionBegin; 1076 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1077 PetscValidPointer(kv, 2); 1078 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1079 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1080 PetscFunctionReturn(PETSC_SUCCESS); 1081 } 1082 1083 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1084 { 1085 Mat_SeqAIJKokkos *aijkok; 1086 1087 PetscFunctionBegin; 1088 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1089 PetscValidPointer(kv, 2); 1090 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1091 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 *kv = aijkok->a_dual.view_device(); 1093 PetscFunctionReturn(PETSC_SUCCESS); 1094 } 1095 1096 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1097 { 1098 PetscFunctionBegin; 1099 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1100 PetscValidPointer(kv, 2); 1101 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1102 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1103 PetscFunctionReturn(PETSC_SUCCESS); 1104 } 1105 1106 /* Computes Y += alpha X */ 1107 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1108 { 1109 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1110 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1111 ConstMatScalarKokkosView Xa; 1112 MatScalarKokkosView Ya; 1113 1114 PetscFunctionBegin; 1115 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1116 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1117 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1118 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1119 PetscCall(PetscLogGpuTimeBegin()); 1120 1121 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1122 PetscBool e; 1123 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1124 if (e) { 1125 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1126 if (e) pattern = SAME_NONZERO_PATTERN; 1127 } 1128 } 1129 1130 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1131 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1132 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1133 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1134 */ 1135 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1136 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1137 Xa = xkok->a_dual.view_device(); 1138 Ya = ykok->a_dual.view_device(); 1139 1140 if (pattern == SAME_NONZERO_PATTERN) { 1141 KokkosBlas::axpy(alpha, Xa, Ya); 1142 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1143 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1144 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1145 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1146 1147 Kokkos::parallel_for( 1148 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1149 PetscInt i = t.league_rank(); // row i 1150 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1151 // Only one thread works in a team 1152 PetscInt p, q = Yi(i); 1153 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1154 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1155 if (Xj(p) == Yj(q)) { // Found it 1156 Ya(q) += alpha * Xa(p); 1157 q++; 1158 } else { 1159 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1160 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1161 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1162 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1163 #else 1164 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1165 #endif 1166 } 1167 } 1168 }); 1169 }); 1170 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1171 } else { // different nonzero patterns 1172 Mat Z; 1173 KokkosCsrMatrix zcsr; 1174 KernelHandle kh; 1175 kh.create_spadd_handle(true); // X, Y are sorted 1176 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1177 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1178 zkok = new Mat_SeqAIJKokkos(zcsr); 1179 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1180 PetscCall(MatHeaderReplace(Y, &Z)); 1181 kh.destroy_spadd_handle(); 1182 } 1183 PetscCall(PetscLogGpuTimeEnd()); 1184 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 struct MatCOOStruct_SeqAIJKokkos { 1189 PetscCount n; 1190 PetscCount Atot; 1191 PetscInt nz; 1192 PetscCountKokkosView jmap; 1193 PetscCountKokkosView perm; 1194 1195 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1196 { 1197 nz = coo_h->nz; 1198 n = coo_h->n; 1199 Atot = coo_h->Atot; 1200 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1201 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1206 { 1207 PetscFunctionBegin; 1208 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1213 { 1214 Mat_SeqAIJKokkos *akok; 1215 Mat_SeqAIJ *aseq; 1216 PetscContainer container_h, container_d; 1217 MatCOOStruct_SeqAIJ *coo_h; 1218 MatCOOStruct_SeqAIJKokkos *coo_d; 1219 1220 PetscFunctionBegin; 1221 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1222 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1223 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1224 delete akok; 1225 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1226 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1227 1228 // Copy the COO struct to device 1229 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1230 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1231 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1232 1233 // Put the COO struct in a container and then attach that to the matrix 1234 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1235 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1236 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1237 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1238 PetscCall(PetscContainerDestroy(&container_d)); 1239 PetscFunctionReturn(PETSC_SUCCESS); 1240 } 1241 1242 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1243 { 1244 MatScalarKokkosView Aa; 1245 ConstMatScalarKokkosView kv; 1246 PetscMemType memtype; 1247 PetscContainer container; 1248 MatCOOStruct_SeqAIJKokkos *coo; 1249 1250 PetscFunctionBegin; 1251 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1252 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1253 1254 const auto &n = coo->n; 1255 const auto &Annz = coo->nz; 1256 const auto &jmap = coo->jmap; 1257 const auto &perm = coo->perm; 1258 1259 PetscCall(PetscGetMemType(v, &memtype)); 1260 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1261 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1262 } else { 1263 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1264 } 1265 1266 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1267 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1268 1269 Kokkos::parallel_for( 1270 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1271 PetscScalar sum = 0.0; 1272 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1273 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1274 }); 1275 1276 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1277 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1278 PetscFunctionReturn(PETSC_SUCCESS); 1279 } 1280 1281 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1282 { 1283 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1284 MatScalarKokkosView Aa; 1285 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1286 PetscInt m = A->rmap->n; 1287 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1288 1289 PetscFunctionBegin; 1290 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1291 Kokkos::parallel_for( 1292 m, KOKKOS_LAMBDA(const PetscInt i) { 1293 PetscScalar tmp; 1294 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1295 tmp = Aa(Ai(i)); 1296 Aa(Ai(i)) = Aa(Adiag(i)); 1297 Aa(Adiag(i)) = tmp; 1298 } 1299 }); 1300 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1301 PetscFunctionReturn(PETSC_SUCCESS); 1302 } 1303 1304 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1305 { 1306 PetscFunctionBegin; 1307 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1308 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1309 B->offloadmask = PETSC_OFFLOAD_CPU; 1310 PetscFunctionReturn(PETSC_SUCCESS); 1311 } 1312 1313 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1314 { 1315 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1316 1317 PetscFunctionBegin; 1318 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1319 A->boundtocpu = PETSC_FALSE; 1320 1321 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1322 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1323 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1324 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1325 A->ops->scale = MatScale_SeqAIJKokkos; 1326 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1327 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1328 A->ops->mult = MatMult_SeqAIJKokkos; 1329 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1330 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1331 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1332 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1333 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1334 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1335 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1336 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1337 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1338 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1339 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1340 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1341 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1342 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1343 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1344 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1345 1346 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1347 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1348 PetscFunctionReturn(PETSC_SUCCESS); 1349 } 1350 1351 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1352 { 1353 Mat_SeqAIJ *aseq; 1354 PetscInt i, m, n; 1355 1356 PetscFunctionBegin; 1357 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1358 1359 m = akok->nrows(); 1360 n = akok->ncols(); 1361 PetscCall(MatSetSizes(A, m, n, m, n)); 1362 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1363 1364 /* Set up data structures of A as a MATSEQAIJ */ 1365 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1366 aseq = (Mat_SeqAIJ *)(A)->data; 1367 1368 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1369 akok->j_dual.sync_host(); 1370 1371 aseq->i = akok->i_host_data(); 1372 aseq->j = akok->j_host_data(); 1373 aseq->a = akok->a_host_data(); 1374 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1375 aseq->singlemalloc = PETSC_FALSE; 1376 aseq->free_a = PETSC_FALSE; 1377 aseq->free_ij = PETSC_FALSE; 1378 aseq->nz = akok->nnz(); 1379 aseq->maxnz = aseq->nz; 1380 1381 PetscCall(PetscMalloc1(m, &aseq->imax)); 1382 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1383 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1384 1385 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1386 akok->nonzerostate = A->nonzerostate; 1387 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1388 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1389 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1390 PetscFunctionReturn(PETSC_SUCCESS); 1391 } 1392 1393 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1394 { 1395 PetscFunctionBegin; 1396 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1397 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1398 PetscFunctionReturn(PETSC_SUCCESS); 1399 } 1400 1401 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1402 { 1403 Mat_SeqAIJKokkos *akok; 1404 PetscFunctionBegin; 1405 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1406 PetscCall(MatCreate(comm, A)); 1407 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1408 PetscFunctionReturn(PETSC_SUCCESS); 1409 } 1410 1411 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1412 1413 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1414 */ 1415 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1416 { 1417 PetscFunctionBegin; 1418 PetscCall(MatCreate(comm, A)); 1419 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1420 PetscFunctionReturn(PETSC_SUCCESS); 1421 } 1422 1423 /*@C 1424 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1425 (the default parallel PETSc format). This matrix will ultimately be handled by 1426 Kokkos for calculations. 1427 1428 Collective 1429 1430 Input Parameters: 1431 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1432 . m - number of rows 1433 . n - number of columns 1434 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1435 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1436 1437 Output Parameter: 1438 . A - the matrix 1439 1440 Level: intermediate 1441 1442 Notes: 1443 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1444 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1445 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1446 1447 The AIJ format, also called 1448 compressed row storage, is fully compatible with standard Fortran 1449 storage. That is, the stored row and column indices can begin at 1450 either one (as in Fortran) or zero. 1451 1452 Specify the preallocated storage with either `nz` or `nnz` (not both). 1453 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1454 allocation. 1455 1456 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1457 @*/ 1458 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1459 { 1460 PetscFunctionBegin; 1461 PetscCall(PetscKokkosInitializeCheck()); 1462 PetscCall(MatCreate(comm, A)); 1463 PetscCall(MatSetSizes(*A, m, n, m, n)); 1464 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1465 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1466 PetscFunctionReturn(PETSC_SUCCESS); 1467 } 1468 1469 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1470 { 1471 PetscFunctionBegin; 1472 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1473 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1474 PetscFunctionReturn(PETSC_SUCCESS); 1475 } 1476 1477 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1478 { 1479 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1480 1481 PetscFunctionBegin; 1482 if (!factors->sptrsv_symbolic_completed) { 1483 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1484 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1485 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1486 } 1487 PetscFunctionReturn(PETSC_SUCCESS); 1488 } 1489 1490 /* Check if we need to update factors etc for transpose solve */ 1491 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1492 { 1493 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1494 MatColIdxType n = A->rmap->n; 1495 1496 PetscFunctionBegin; 1497 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1498 /* Update L^T and do sptrsv symbolic */ 1499 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1500 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1501 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1502 1503 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1504 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1505 1506 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1507 We have to sort the indices, until KK provides finer control options. 1508 */ 1509 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1510 1511 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1512 1513 /* Update U^T and do sptrsv symbolic */ 1514 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1515 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1516 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1517 1518 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1519 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1520 1521 /* Sort indices. See comments above */ 1522 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1523 1524 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1525 factors->transpose_updated = PETSC_TRUE; 1526 } 1527 PetscFunctionReturn(PETSC_SUCCESS); 1528 } 1529 1530 /* Solve Ax = b, with A = LU */ 1531 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1532 { 1533 ConstPetscScalarKokkosView bv; 1534 PetscScalarKokkosView xv; 1535 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1536 1537 PetscFunctionBegin; 1538 PetscCall(PetscLogGpuTimeBegin()); 1539 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1540 PetscCall(VecGetKokkosView(b, &bv)); 1541 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1542 /* Solve L tmpv = b */ 1543 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1544 /* Solve Ux = tmpv */ 1545 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1546 PetscCall(VecRestoreKokkosView(b, &bv)); 1547 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1548 PetscCall(PetscLogGpuTimeEnd()); 1549 PetscFunctionReturn(PETSC_SUCCESS); 1550 } 1551 1552 /* Solve A^T x = b, where A^T = U^T L^T */ 1553 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1554 { 1555 ConstPetscScalarKokkosView bv; 1556 PetscScalarKokkosView xv; 1557 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1558 1559 PetscFunctionBegin; 1560 PetscCall(PetscLogGpuTimeBegin()); 1561 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1562 PetscCall(VecGetKokkosView(b, &bv)); 1563 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1564 /* Solve U^T tmpv = b */ 1565 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1566 1567 /* Solve L^T x = tmpv */ 1568 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1569 PetscCall(VecRestoreKokkosView(b, &bv)); 1570 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1571 PetscCall(PetscLogGpuTimeEnd()); 1572 PetscFunctionReturn(PETSC_SUCCESS); 1573 } 1574 1575 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1576 { 1577 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1578 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1579 PetscInt fill_lev = info->levels; 1580 1581 PetscFunctionBegin; 1582 PetscCall(PetscLogGpuTimeBegin()); 1583 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1584 1585 auto a_d = aijkok->a_dual.view_device(); 1586 auto i_d = aijkok->i_dual.view_device(); 1587 auto j_d = aijkok->j_dual.view_device(); 1588 1589 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1590 1591 B->assembled = PETSC_TRUE; 1592 B->preallocated = PETSC_TRUE; 1593 B->ops->solve = MatSolve_SeqAIJKokkos; 1594 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1595 B->ops->matsolve = NULL; 1596 B->ops->matsolvetranspose = NULL; 1597 B->offloadmask = PETSC_OFFLOAD_GPU; 1598 1599 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1600 factors->transpose_updated = PETSC_FALSE; 1601 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1602 /* TODO: log flops, but how to know that? */ 1603 PetscCall(PetscLogGpuTimeEnd()); 1604 PetscFunctionReturn(PETSC_SUCCESS); 1605 } 1606 1607 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1608 { 1609 Mat_SeqAIJKokkos *aijkok; 1610 Mat_SeqAIJ *b; 1611 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1612 PetscInt fill_lev = info->levels; 1613 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1614 PetscInt n = A->rmap->n; 1615 1616 PetscFunctionBegin; 1617 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1618 /* Rebuild factors */ 1619 if (factors) { 1620 factors->Destroy(); 1621 } /* Destroy the old if it exists */ 1622 else { 1623 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1624 } 1625 1626 /* Create a spiluk handle and then do symbolic factorization */ 1627 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1628 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1629 1630 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1631 1632 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1633 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1634 Kokkos::realloc(factors->iU_d, n + 1); 1635 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1636 1637 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1638 auto i_d = aijkok->i_dual.view_device(); 1639 auto j_d = aijkok->j_dual.view_device(); 1640 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1641 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1642 1643 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1644 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1645 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1646 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1647 1648 /* TODO: add options to select sptrsv algorithms */ 1649 /* Create sptrsv handles for L, U and their transpose */ 1650 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1651 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1652 #else 1653 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1654 #endif 1655 1656 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1657 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1658 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1659 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1660 1661 /* Fill fields of the factor matrix B */ 1662 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1663 b = (Mat_SeqAIJ *)B->data; 1664 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1665 B->info.fill_ratio_given = info->fill; 1666 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1667 1668 B->offloadmask = PETSC_OFFLOAD_GPU; 1669 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1670 PetscFunctionReturn(PETSC_SUCCESS); 1671 } 1672 1673 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1674 { 1675 PetscFunctionBegin; 1676 *type = MATSOLVERKOKKOS; 1677 PetscFunctionReturn(PETSC_SUCCESS); 1678 } 1679 1680 /*MC 1681 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1682 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1683 1684 Level: beginner 1685 1686 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1687 M*/ 1688 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1689 { 1690 PetscInt n = A->rmap->n; 1691 1692 PetscFunctionBegin; 1693 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1694 PetscCall(MatSetSizes(*B, n, n, n, n)); 1695 (*B)->factortype = ftype; 1696 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1697 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1698 1699 if (ftype == MAT_FACTOR_LU) { 1700 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1701 (*B)->canuseordering = PETSC_TRUE; 1702 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1703 } else if (ftype == MAT_FACTOR_ILU) { 1704 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1705 (*B)->canuseordering = PETSC_FALSE; 1706 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1707 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1708 1709 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1710 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1711 PetscFunctionReturn(PETSC_SUCCESS); 1712 } 1713 1714 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1715 { 1716 PetscFunctionBegin; 1717 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1718 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1719 PetscFunctionReturn(PETSC_SUCCESS); 1720 } 1721 1722 /* Utility to print out a KokkosCsrMatrix for debugging */ 1723 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1724 { 1725 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1726 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1727 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1728 const PetscInt *i = iv.data(); 1729 const PetscInt *j = jv.data(); 1730 const PetscScalar *a = av.data(); 1731 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1732 1733 PetscFunctionBegin; 1734 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1735 for (PetscInt k = 0; k < m; k++) { 1736 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1737 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1738 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1739 } 1740 PetscFunctionReturn(PETSC_SUCCESS); 1741 } 1742