1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 PetscFunctionReturn(PETSC_SUCCESS); 58 } 59 60 /* Sync CSR data to device if not yet */ 61 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 62 { 63 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 64 65 PetscFunctionBegin; 66 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 67 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 68 if (aijkok->a_dual.need_sync_device()) { 69 aijkok->a_dual.sync_device(); 70 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 71 aijkok->hermitian_updated = PETSC_FALSE; 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 /* Mark the CSR data on device as modified */ 77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 78 { 79 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 80 81 PetscFunctionBegin; 82 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 83 aijkok->a_dual.clear_sync_state(); 84 aijkok->a_dual.modify_device(); 85 aijkok->transpose_updated = PETSC_FALSE; 86 aijkok->hermitian_updated = PETSC_FALSE; 87 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 88 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 93 { 94 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95 96 PetscFunctionBegin; 97 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 98 /* We do not expect one needs factors on host */ 99 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 100 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 101 aijkok->a_dual.sync_host(); 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 106 { 107 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 108 109 PetscFunctionBegin; 110 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 111 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 112 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 113 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 114 */ 115 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 116 aijkok->a_dual.sync_host(); 117 *array = aijkok->a_dual.view_host().data(); 118 } else { /* Happens when calling MatSetValues on a newly created matrix */ 119 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 120 } 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 130 PetscFunctionReturn(PETSC_SUCCESS); 131 } 132 133 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 134 { 135 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 136 137 PetscFunctionBegin; 138 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 139 aijkok->a_dual.sync_host(); 140 *array = aijkok->a_dual.view_host().data(); 141 } else { 142 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 143 } 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 148 { 149 PetscFunctionBegin; 150 *array = NULL; 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155 { 156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157 158 PetscFunctionBegin; 159 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 160 *array = aijkok->a_dual.view_host().data(); 161 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 162 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 163 } 164 PetscFunctionReturn(PETSC_SUCCESS); 165 } 166 167 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 168 { 169 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 170 171 PetscFunctionBegin; 172 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 173 aijkok->a_dual.clear_sync_state(); 174 aijkok->a_dual.modify_host(); 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 185 186 if (i) *i = aijkok->i_device_data(); 187 if (j) *j = aijkok->j_device_data(); 188 if (a) { 189 aijkok->a_dual.sync_device(); 190 *a = aijkok->a_device_data(); 191 } 192 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 /* 197 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 198 199 Input Parameter: 200 . A - the MATSEQAIJKOKKOS matrix 201 202 Output Parameters: 203 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 204 - T_d - the transpose on device, whose value array is allocated but not initialized 205 */ 206 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 207 { 208 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 209 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 210 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 211 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 212 MatRowMapType *Ti = Ti_h.data(); 213 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 214 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 215 PetscInt *Tj = Tj_h.data(); 216 PetscInt *perm = perm_h.data(); 217 PetscInt *offset; 218 219 PetscFunctionBegin; 220 // Populate Ti 221 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 222 Ti++; 223 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 224 Ti--; 225 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 226 227 // Populate Tj and the permutation array 228 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 229 for (PetscInt i = 0; i < m; i++) { 230 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 231 PetscInt r = Aj[j]; // row r of T 232 PetscInt disp = Ti[r] + offset[r]; 233 234 Tj[disp] = i; // col i of T 235 perm[disp] = j; 236 offset[r]++; 237 } 238 } 239 PetscCall(PetscFree(offset)); 240 241 // Sort each row of T, along with the permutation array 242 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 243 244 // Output perm and T on device 245 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 246 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 247 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 248 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 // Generate the transpose on device and cache it internally 253 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 254 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 258 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 259 KokkosCsrMatrix &T = akok->csrmatT; 260 261 PetscFunctionBegin; 262 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 263 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 264 265 const auto &Aa = akok->a_dual.view_device(); 266 267 if (A->symmetric == PETSC_BOOL3_TRUE) { 268 *csrmatT = akok->csrmat; 269 } else { 270 // See if we already have a cached transpose and its value is up to date 271 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 272 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 273 const auto &perm = akok->transpose_perm; // get the permutation array 274 auto &Ta = T.values; 275 276 PetscCallCXX(Kokkos::parallel_for( 277 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 278 } 279 } else { // Generate T of size n x m for the first time 280 MatRowMapKokkosView perm; 281 282 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 283 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 284 PetscCallCXX(Kokkos::parallel_for( 285 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 286 } 287 akok->transpose_updated = PETSC_TRUE; 288 *csrmatT = akok->csrmatT; 289 } 290 PetscFunctionReturn(PETSC_SUCCESS); 291 } 292 293 // Generate the Hermitian on device and cache it internally 294 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 295 { 296 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 297 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 298 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 299 KokkosCsrMatrix &T = akok->csrmatH; 300 301 PetscFunctionBegin; 302 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 303 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 304 305 const auto &Aa = akok->a_dual.view_device(); 306 307 if (A->hermitian == PETSC_BOOL3_TRUE) { 308 *csrmatH = akok->csrmat; 309 } else { 310 // See if we already have a cached hermitian and its value is up to date 311 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 312 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 313 const auto &perm = akok->transpose_perm; // get the permutation array 314 auto &Ta = T.values; 315 316 PetscCallCXX(Kokkos::parallel_for( 317 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 318 } 319 } else { // Generate T of size n x m for the first time 320 MatRowMapKokkosView perm; 321 322 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 323 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 324 PetscCallCXX(Kokkos::parallel_for( 325 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 326 } 327 akok->hermitian_updated = PETSC_TRUE; 328 *csrmatH = akok->csrmatH; 329 } 330 PetscFunctionReturn(PETSC_SUCCESS); 331 } 332 333 /* y = A x */ 334 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 335 { 336 Mat_SeqAIJKokkos *aijkok; 337 ConstPetscScalarKokkosView xv; 338 PetscScalarKokkosView yv; 339 340 PetscFunctionBegin; 341 PetscCall(PetscLogGpuTimeBegin()); 342 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 343 PetscCall(VecGetKokkosView(xx, &xv)); 344 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 345 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 346 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 347 PetscCall(VecRestoreKokkosView(xx, &xv)); 348 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 349 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 350 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 351 PetscCall(PetscLogGpuTimeEnd()); 352 PetscFunctionReturn(PETSC_SUCCESS); 353 } 354 355 /* y = A^T x */ 356 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJKokkos *aijkok; 359 const char *mode; 360 ConstPetscScalarKokkosView xv; 361 PetscScalarKokkosView yv; 362 KokkosCsrMatrix csrmat; 363 364 PetscFunctionBegin; 365 PetscCall(PetscLogGpuTimeBegin()); 366 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 367 PetscCall(VecGetKokkosView(xx, &xv)); 368 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 369 if (A->form_explicit_transpose) { 370 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 374 csrmat = aijkok->csrmat; 375 mode = "T"; 376 } 377 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 381 PetscCall(PetscLogGpuTimeEnd()); 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 /* y = A^H x */ 386 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387 { 388 Mat_SeqAIJKokkos *aijkok; 389 const char *mode; 390 ConstPetscScalarKokkosView xv; 391 PetscScalarKokkosView yv; 392 KokkosCsrMatrix csrmat; 393 394 PetscFunctionBegin; 395 PetscCall(PetscLogGpuTimeBegin()); 396 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 397 PetscCall(VecGetKokkosView(xx, &xv)); 398 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 399 if (A->form_explicit_transpose) { 400 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 401 mode = "N"; 402 } else { 403 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 404 csrmat = aijkok->csrmat; 405 mode = "C"; 406 } 407 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 408 PetscCall(VecRestoreKokkosView(xx, &xv)); 409 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 410 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 411 PetscCall(PetscLogGpuTimeEnd()); 412 PetscFunctionReturn(PETSC_SUCCESS); 413 } 414 415 /* z = A x + y */ 416 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 417 { 418 Mat_SeqAIJKokkos *aijkok; 419 ConstPetscScalarKokkosView xv, yv; 420 PetscScalarKokkosView zv; 421 422 PetscFunctionBegin; 423 PetscCall(PetscLogGpuTimeBegin()); 424 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 425 PetscCall(VecGetKokkosView(xx, &xv)); 426 PetscCall(VecGetKokkosView(yy, &yv)); 427 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 428 if (zz != yy) Kokkos::deep_copy(zv, yv); 429 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 430 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 431 PetscCall(VecRestoreKokkosView(xx, &xv)); 432 PetscCall(VecRestoreKokkosView(yy, &yv)); 433 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 434 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 435 PetscCall(PetscLogGpuTimeEnd()); 436 PetscFunctionReturn(PETSC_SUCCESS); 437 } 438 439 /* z = A^T x + y */ 440 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 441 { 442 Mat_SeqAIJKokkos *aijkok; 443 const char *mode; 444 ConstPetscScalarKokkosView xv, yv; 445 PetscScalarKokkosView zv; 446 KokkosCsrMatrix csrmat; 447 448 PetscFunctionBegin; 449 PetscCall(PetscLogGpuTimeBegin()); 450 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 451 PetscCall(VecGetKokkosView(xx, &xv)); 452 PetscCall(VecGetKokkosView(yy, &yv)); 453 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 454 if (zz != yy) Kokkos::deep_copy(zv, yv); 455 if (A->form_explicit_transpose) { 456 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 457 mode = "N"; 458 } else { 459 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 460 csrmat = aijkok->csrmat; 461 mode = "T"; 462 } 463 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 464 PetscCall(VecRestoreKokkosView(xx, &xv)); 465 PetscCall(VecRestoreKokkosView(yy, &yv)); 466 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 467 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 468 PetscCall(PetscLogGpuTimeEnd()); 469 PetscFunctionReturn(PETSC_SUCCESS); 470 } 471 472 /* z = A^H x + y */ 473 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 474 { 475 Mat_SeqAIJKokkos *aijkok; 476 const char *mode; 477 ConstPetscScalarKokkosView xv, yv; 478 PetscScalarKokkosView zv; 479 KokkosCsrMatrix csrmat; 480 481 PetscFunctionBegin; 482 PetscCall(PetscLogGpuTimeBegin()); 483 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 484 PetscCall(VecGetKokkosView(xx, &xv)); 485 PetscCall(VecGetKokkosView(yy, &yv)); 486 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 487 if (zz != yy) Kokkos::deep_copy(zv, yv); 488 if (A->form_explicit_transpose) { 489 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 490 mode = "N"; 491 } else { 492 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 493 csrmat = aijkok->csrmat; 494 mode = "C"; 495 } 496 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 497 PetscCall(VecRestoreKokkosView(xx, &xv)); 498 PetscCall(VecRestoreKokkosView(yy, &yv)); 499 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 500 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 501 PetscCall(PetscLogGpuTimeEnd()); 502 PetscFunctionReturn(PETSC_SUCCESS); 503 } 504 505 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 506 { 507 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508 509 PetscFunctionBegin; 510 switch (op) { 511 case MAT_FORM_EXPLICIT_TRANSPOSE: 512 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 513 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 514 A->form_explicit_transpose = flg; 515 break; 516 default: 517 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 518 break; 519 } 520 PetscFunctionReturn(PETSC_SUCCESS); 521 } 522 523 /* Depending on reuse, either build a new mat, or use the existing mat */ 524 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 525 { 526 Mat_SeqAIJ *aseq; 527 528 PetscFunctionBegin; 529 PetscCall(PetscKokkosInitializeCheck()); 530 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 531 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 532 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 533 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 534 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 535 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 536 PetscCall(PetscFree(A->defaultvectype)); 537 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 538 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 539 PetscCall(MatSetOps_SeqAIJKokkos(A)); 540 aseq = static_cast<Mat_SeqAIJ *>(A->data); 541 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 542 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 543 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 544 } 545 } 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 549 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 550 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 551 */ 552 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 553 { 554 Mat_SeqAIJ *bseq; 555 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 556 Mat mat; 557 558 PetscFunctionBegin; 559 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 560 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 561 mat = *B; 562 if (A->preallocated) { 563 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 564 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 565 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 566 /* Now copy values to B if needed */ 567 if (dupOption == MAT_COPY_VALUES) { 568 if (akok->a_dual.need_sync_device()) { 569 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 570 bkok->a_dual.modify_host(); 571 } else { /* If device has the latest data, we only copy data on device */ 572 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 573 bkok->a_dual.modify_device(); 574 } 575 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 576 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 577 bkok->a_dual.modify_host(); 578 } 579 mat->spptr = bkok; 580 } 581 582 PetscCall(PetscFree(mat->defaultvectype)); 583 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 584 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 585 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 586 PetscFunctionReturn(PETSC_SUCCESS); 587 } 588 589 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 590 { 591 Mat At; 592 KokkosCsrMatrix internT; 593 Mat_SeqAIJKokkos *atkok, *bkok; 594 595 PetscFunctionBegin; 596 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 597 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 598 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 599 /* Deep copy internT, as we want to isolate the internal transpose */ 600 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 601 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 602 if (reuse == MAT_INITIAL_MATRIX) *B = At; 603 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 604 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 605 if ((*B)->assembled) { 606 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 607 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 608 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 609 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 610 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 611 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 612 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 613 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 614 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 615 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 616 } 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 621 { 622 Mat_SeqAIJKokkos *aijkok; 623 624 PetscFunctionBegin; 625 if (A->factortype == MAT_FACTOR_NONE) { 626 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 627 delete aijkok; 628 } else { 629 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 630 } 631 A->spptr = NULL; 632 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 633 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 635 PetscCall(MatDestroy_SeqAIJ(A)); 636 PetscFunctionReturn(PETSC_SUCCESS); 637 } 638 639 /*MC 640 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 641 642 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 643 644 Options Database Key: 645 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 646 647 Level: beginner 648 649 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 650 M*/ 651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 652 { 653 PetscFunctionBegin; 654 PetscCall(PetscKokkosInitializeCheck()); 655 PetscCall(MatCreate_SeqAIJ(A)); 656 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 657 PetscFunctionReturn(PETSC_SUCCESS); 658 } 659 660 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 661 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 662 { 663 Mat_SeqAIJ *a, *b; 664 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 665 MatScalarKokkosView aa, ba, ca; 666 MatRowMapKokkosView ai, bi, ci; 667 MatColIdxKokkosView aj, bj, cj; 668 PetscInt m, n, nnz, aN; 669 670 PetscFunctionBegin; 671 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 672 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 673 PetscValidPointer(C, 4); 674 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 675 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 676 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 677 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 678 679 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 680 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 681 a = static_cast<Mat_SeqAIJ *>(A->data); 682 b = static_cast<Mat_SeqAIJ *>(B->data); 683 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 684 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 685 aa = akok->a_dual.view_device(); 686 ai = akok->i_dual.view_device(); 687 ba = bkok->a_dual.view_device(); 688 bi = bkok->i_dual.view_device(); 689 m = A->rmap->n; /* M, N and nnz of C */ 690 n = A->cmap->n + B->cmap->n; 691 nnz = a->nz + b->nz; 692 aN = A->cmap->n; /* N of A */ 693 if (reuse == MAT_INITIAL_MATRIX) { 694 aj = akok->j_dual.view_device(); 695 bj = bkok->j_dual.view_device(); 696 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 697 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 698 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 699 ca = ca_dual.view_device(); 700 ci = ci_dual.view_device(); 701 cj = cj_dual.view_device(); 702 703 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 704 Kokkos::parallel_for( 705 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 706 PetscInt i = t.league_rank(); /* row i */ 707 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 708 709 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 710 ci(i) = coffset; 711 if (i == m - 1) ci(m) = ai(m) + bi(m); 712 }); 713 714 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 715 if (k < alen) { 716 ca(coffset + k) = aa(ai(i) + k); 717 cj(coffset + k) = aj(ai(i) + k); 718 } else { 719 ca(coffset + k) = ba(bi(i) + k - alen); 720 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 721 } 722 }); 723 }); 724 ca_dual.modify_device(); 725 ci_dual.modify_device(); 726 cj_dual.modify_device(); 727 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 728 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 729 } else if (reuse == MAT_REUSE_MATRIX) { 730 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 731 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 732 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 733 ca = ckok->a_dual.view_device(); 734 ci = ckok->i_dual.view_device(); 735 736 Kokkos::parallel_for( 737 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 738 PetscInt i = t.league_rank(); /* row i */ 739 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 740 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 742 else ca(ci(i) + k) = ba(bi(i) + k - alen); 743 }); 744 }); 745 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 746 } 747 PetscFunctionReturn(PETSC_SUCCESS); 748 } 749 750 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 751 { 752 PetscFunctionBegin; 753 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 754 PetscFunctionReturn(PETSC_SUCCESS); 755 } 756 757 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 758 { 759 Mat_Product *product = C->product; 760 Mat A, B; 761 bool transA, transB; /* use bool, since KK needs this type */ 762 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 763 Mat_SeqAIJ *c; 764 MatProductData_SeqAIJKokkos *pdata; 765 KokkosCsrMatrix csrmatA, csrmatB; 766 767 PetscFunctionBegin; 768 MatCheckProduct(C, 1); 769 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 770 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 771 772 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 773 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 774 // we still do numeric. 775 if (pdata->reusesym) { // numeric reuses results from symbolic 776 pdata->reusesym = PETSC_FALSE; 777 PetscFunctionReturn(PETSC_SUCCESS); 778 } 779 780 switch (product->type) { 781 case MATPRODUCT_AB: 782 transA = false; 783 transB = false; 784 break; 785 case MATPRODUCT_AtB: 786 transA = true; 787 transB = false; 788 break; 789 case MATPRODUCT_ABt: 790 transA = false; 791 transB = true; 792 break; 793 default: 794 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 795 } 796 797 A = product->A; 798 B = product->B; 799 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 800 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 801 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 802 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 803 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 804 805 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 806 807 csrmatA = akok->csrmat; 808 csrmatB = bkok->csrmat; 809 810 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 811 if (transA) { 812 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 813 transA = false; 814 } 815 816 if (transB) { 817 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 818 transB = false; 819 } 820 PetscCall(PetscLogGpuTimeBegin()); 821 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 822 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 823 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 824 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 825 #endif 826 827 PetscCall(PetscLogGpuTimeEnd()); 828 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 829 /* shorter version of MatAssemblyEnd_SeqAIJ */ 830 c = (Mat_SeqAIJ *)C->data; 831 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 832 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 833 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 834 c->reallocs = 0; 835 C->info.mallocs = 0; 836 C->info.nz_unneeded = 0; 837 C->assembled = C->was_assembled = PETSC_TRUE; 838 C->num_ass++; 839 PetscFunctionReturn(PETSC_SUCCESS); 840 } 841 842 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 843 { 844 Mat_Product *product = C->product; 845 MatProductType ptype; 846 Mat A, B; 847 bool transA, transB; 848 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 849 MatProductData_SeqAIJKokkos *pdata; 850 MPI_Comm comm; 851 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 852 853 PetscFunctionBegin; 854 MatCheckProduct(C, 1); 855 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 856 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 857 A = product->A; 858 B = product->B; 859 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 860 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 861 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 863 csrmatA = akok->csrmat; 864 csrmatB = bkok->csrmat; 865 866 ptype = product->type; 867 // Take advantage of the symmetry if true 868 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 869 ptype = MATPRODUCT_AB; 870 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 871 } 872 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 873 ptype = MATPRODUCT_AB; 874 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 875 } 876 877 switch (ptype) { 878 case MATPRODUCT_AB: 879 transA = false; 880 transB = false; 881 break; 882 case MATPRODUCT_AtB: 883 transA = true; 884 transB = false; 885 break; 886 case MATPRODUCT_ABt: 887 transA = false; 888 transB = true; 889 break; 890 default: 891 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 892 } 893 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 894 pdata->reusesym = product->api_user; 895 896 /* TODO: add command line options to select spgemm algorithms */ 897 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 898 899 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 900 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 901 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 902 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 903 #endif 904 #endif 905 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 906 907 PetscCall(PetscLogGpuTimeBegin()); 908 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 909 if (transA) { 910 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 911 transA = false; 912 } 913 914 if (transB) { 915 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 916 transB = false; 917 } 918 919 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 920 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 921 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 922 calling new Mat_SeqAIJKokkos(). 923 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 924 */ 925 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 926 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 927 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 928 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 929 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 930 #endif 931 PetscCall(PetscLogGpuTimeEnd()); 932 933 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 934 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 935 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 936 PetscFunctionReturn(PETSC_SUCCESS); 937 } 938 939 /* handles sparse matrix matrix ops */ 940 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 941 { 942 Mat_Product *product = mat->product; 943 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 944 945 PetscFunctionBegin; 946 MatCheckProduct(mat, 1); 947 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 948 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 949 if (Biskok && Ciskok) { 950 switch (product->type) { 951 case MATPRODUCT_AB: 952 case MATPRODUCT_AtB: 953 case MATPRODUCT_ABt: 954 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 955 break; 956 case MATPRODUCT_PtAP: 957 case MATPRODUCT_RARt: 958 case MATPRODUCT_ABC: 959 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 960 break; 961 default: 962 break; 963 } 964 } else { /* fallback for AIJ */ 965 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 966 } 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 971 { 972 Mat_SeqAIJKokkos *aijkok; 973 974 PetscFunctionBegin; 975 PetscCall(PetscLogGpuTimeBegin()); 976 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 977 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 978 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 979 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 980 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 981 PetscCall(PetscLogGpuTimeEnd()); 982 PetscFunctionReturn(PETSC_SUCCESS); 983 } 984 985 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 986 { 987 Mat_SeqAIJKokkos *aijkok; 988 989 PetscFunctionBegin; 990 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 991 if (aijkok) { /* Only zero the device if data is already there */ 992 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 993 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 994 } else { /* Might be preallocated but not assembled */ 995 PetscCall(MatZeroEntries_SeqAIJ(A)); 996 } 997 PetscFunctionReturn(PETSC_SUCCESS); 998 } 999 1000 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1001 { 1002 Mat_SeqAIJ *aijseq; 1003 Mat_SeqAIJKokkos *aijkok; 1004 PetscInt n; 1005 PetscScalarKokkosView xv; 1006 1007 PetscFunctionBegin; 1008 PetscCall(VecGetLocalSize(x, &n)); 1009 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1010 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1011 1012 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014 1015 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1016 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1017 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1018 aijkok->SetDiagonal(aijseq->diag); 1019 } 1020 1021 const auto &Aa = aijkok->a_dual.view_device(); 1022 const auto &Ai = aijkok->i_dual.view_device(); 1023 const auto &Adiag = aijkok->diag_dual.view_device(); 1024 1025 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1026 Kokkos::parallel_for( 1027 n, KOKKOS_LAMBDA(const PetscInt i) { 1028 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1029 else xv(i) = 0; 1030 }); 1031 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1032 PetscFunctionReturn(PETSC_SUCCESS); 1033 } 1034 1035 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1036 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1037 { 1038 Mat_SeqAIJKokkos *aijkok; 1039 1040 PetscFunctionBegin; 1041 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1042 PetscValidPointer(kv, 2); 1043 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1044 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046 *kv = aijkok->a_dual.view_device(); 1047 PetscFunctionReturn(PETSC_SUCCESS); 1048 } 1049 1050 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1051 { 1052 PetscFunctionBegin; 1053 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1054 PetscValidPointer(kv, 2); 1055 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1056 PetscFunctionReturn(PETSC_SUCCESS); 1057 } 1058 1059 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1060 { 1061 Mat_SeqAIJKokkos *aijkok; 1062 1063 PetscFunctionBegin; 1064 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1065 PetscValidPointer(kv, 2); 1066 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1067 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1068 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1069 *kv = aijkok->a_dual.view_device(); 1070 PetscFunctionReturn(PETSC_SUCCESS); 1071 } 1072 1073 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1074 { 1075 PetscFunctionBegin; 1076 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1077 PetscValidPointer(kv, 2); 1078 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1079 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1080 PetscFunctionReturn(PETSC_SUCCESS); 1081 } 1082 1083 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1084 { 1085 Mat_SeqAIJKokkos *aijkok; 1086 1087 PetscFunctionBegin; 1088 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1089 PetscValidPointer(kv, 2); 1090 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1091 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 *kv = aijkok->a_dual.view_device(); 1093 PetscFunctionReturn(PETSC_SUCCESS); 1094 } 1095 1096 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1097 { 1098 PetscFunctionBegin; 1099 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1100 PetscValidPointer(kv, 2); 1101 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1102 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1103 PetscFunctionReturn(PETSC_SUCCESS); 1104 } 1105 1106 /* Computes Y += alpha X */ 1107 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1108 { 1109 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1110 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1111 ConstMatScalarKokkosView Xa; 1112 MatScalarKokkosView Ya; 1113 1114 PetscFunctionBegin; 1115 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1116 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1117 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1118 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1119 PetscCall(PetscLogGpuTimeBegin()); 1120 1121 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1122 PetscBool e; 1123 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1124 if (e) { 1125 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1126 if (e) pattern = SAME_NONZERO_PATTERN; 1127 } 1128 } 1129 1130 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1131 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1132 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1133 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1134 */ 1135 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1136 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1137 Xa = xkok->a_dual.view_device(); 1138 Ya = ykok->a_dual.view_device(); 1139 1140 if (pattern == SAME_NONZERO_PATTERN) { 1141 KokkosBlas::axpy(alpha, Xa, Ya); 1142 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1143 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1144 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1145 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1146 1147 Kokkos::parallel_for( 1148 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1149 PetscInt i = t.league_rank(); // row i 1150 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1151 // Only one thread works in a team 1152 PetscInt p, q = Yi(i); 1153 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1154 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1155 if (Xj(p) == Yj(q)) { // Found it 1156 Ya(q) += alpha * Xa(p); 1157 q++; 1158 } else { 1159 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1160 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1161 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1162 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1163 #else 1164 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1165 #endif 1166 } 1167 } 1168 }); 1169 }); 1170 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1171 } else { // different nonzero patterns 1172 Mat Z; 1173 KokkosCsrMatrix zcsr; 1174 KernelHandle kh; 1175 kh.create_spadd_handle(true); // X, Y are sorted 1176 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1177 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1178 zkok = new Mat_SeqAIJKokkos(zcsr); 1179 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1180 PetscCall(MatHeaderReplace(Y, &Z)); 1181 kh.destroy_spadd_handle(); 1182 } 1183 PetscCall(PetscLogGpuTimeEnd()); 1184 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 struct MatCOOStruct_SeqAIJKokkos { 1189 PetscCount n; 1190 PetscCount Atot; 1191 PetscInt nz; 1192 PetscCountKokkosView jmap; 1193 PetscCountKokkosView perm; 1194 1195 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1196 { 1197 nz = coo_h->nz; 1198 n = coo_h->n; 1199 Atot = coo_h->Atot; 1200 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1201 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1202 } 1203 }; 1204 1205 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1206 { 1207 PetscFunctionBegin; 1208 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1213 { 1214 Mat_SeqAIJKokkos *akok; 1215 Mat_SeqAIJ *aseq; 1216 PetscContainer container_h, container_d; 1217 MatCOOStruct_SeqAIJ *coo_h; 1218 MatCOOStruct_SeqAIJKokkos *coo_d; 1219 1220 PetscFunctionBegin; 1221 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1222 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1223 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1224 delete akok; 1225 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1226 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1227 1228 // Copy the COO struct to device 1229 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1230 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1231 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1232 1233 // Put the COO struct in a container and then attach that to the matrix 1234 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1235 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1236 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1237 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1238 PetscCall(PetscContainerDestroy(&container_d)); 1239 PetscFunctionReturn(PETSC_SUCCESS); 1240 } 1241 1242 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1243 { 1244 MatScalarKokkosView Aa; 1245 ConstMatScalarKokkosView kv; 1246 PetscMemType memtype; 1247 PetscContainer container; 1248 MatCOOStruct_SeqAIJKokkos *coo; 1249 1250 PetscFunctionBegin; 1251 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1252 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1253 1254 const auto &n = coo->n; 1255 const auto &Annz = coo->nz; 1256 const auto &jmap = coo->jmap; 1257 const auto &perm = coo->perm; 1258 1259 PetscCall(PetscGetMemType(v, &memtype)); 1260 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1261 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1262 } else { 1263 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1264 } 1265 1266 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1267 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1268 1269 Kokkos::parallel_for( 1270 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1271 PetscScalar sum = 0.0; 1272 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1273 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1274 }); 1275 1276 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1277 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1278 PetscFunctionReturn(PETSC_SUCCESS); 1279 } 1280 1281 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1282 { 1283 PetscFunctionBegin; 1284 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1285 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1286 B->offloadmask = PETSC_OFFLOAD_CPU; 1287 PetscFunctionReturn(PETSC_SUCCESS); 1288 } 1289 1290 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1291 { 1292 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1293 1294 PetscFunctionBegin; 1295 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1296 A->boundtocpu = PETSC_FALSE; 1297 1298 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1299 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1300 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1301 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1302 A->ops->scale = MatScale_SeqAIJKokkos; 1303 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1304 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1305 A->ops->mult = MatMult_SeqAIJKokkos; 1306 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1307 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1308 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1309 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1310 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1311 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1312 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1313 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1314 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1315 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1316 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1317 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1318 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1319 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1320 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1321 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1322 1323 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1324 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1325 PetscFunctionReturn(PETSC_SUCCESS); 1326 } 1327 1328 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1329 { 1330 Mat_SeqAIJ *aseq; 1331 PetscInt i, m, n; 1332 1333 PetscFunctionBegin; 1334 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1335 1336 m = akok->nrows(); 1337 n = akok->ncols(); 1338 PetscCall(MatSetSizes(A, m, n, m, n)); 1339 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1340 1341 /* Set up data structures of A as a MATSEQAIJ */ 1342 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1343 aseq = (Mat_SeqAIJ *)(A)->data; 1344 1345 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1346 akok->j_dual.sync_host(); 1347 1348 aseq->i = akok->i_host_data(); 1349 aseq->j = akok->j_host_data(); 1350 aseq->a = akok->a_host_data(); 1351 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1352 aseq->singlemalloc = PETSC_FALSE; 1353 aseq->free_a = PETSC_FALSE; 1354 aseq->free_ij = PETSC_FALSE; 1355 aseq->nz = akok->nnz(); 1356 aseq->maxnz = aseq->nz; 1357 1358 PetscCall(PetscMalloc1(m, &aseq->imax)); 1359 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1360 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1361 1362 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1363 akok->nonzerostate = A->nonzerostate; 1364 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1365 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1366 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1367 PetscFunctionReturn(PETSC_SUCCESS); 1368 } 1369 1370 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1371 { 1372 PetscFunctionBegin; 1373 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1374 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1375 PetscFunctionReturn(PETSC_SUCCESS); 1376 } 1377 1378 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1379 { 1380 Mat_SeqAIJKokkos *akok; 1381 PetscFunctionBegin; 1382 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1383 PetscCall(MatCreate(comm, A)); 1384 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1389 1390 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1391 */ 1392 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1393 { 1394 PetscFunctionBegin; 1395 PetscCall(MatCreate(comm, A)); 1396 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1397 PetscFunctionReturn(PETSC_SUCCESS); 1398 } 1399 1400 /*@C 1401 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1402 (the default parallel PETSc format). This matrix will ultimately be handled by 1403 Kokkos for calculations. 1404 1405 Collective 1406 1407 Input Parameters: 1408 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1409 . m - number of rows 1410 . n - number of columns 1411 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1412 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1413 1414 Output Parameter: 1415 . A - the matrix 1416 1417 Level: intermediate 1418 1419 Notes: 1420 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1421 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1422 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1423 1424 The AIJ format, also called 1425 compressed row storage, is fully compatible with standard Fortran 1426 storage. That is, the stored row and column indices can begin at 1427 either one (as in Fortran) or zero. 1428 1429 Specify the preallocated storage with either `nz` or `nnz` (not both). 1430 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1431 allocation. 1432 1433 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1434 @*/ 1435 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1436 { 1437 PetscFunctionBegin; 1438 PetscCall(PetscKokkosInitializeCheck()); 1439 PetscCall(MatCreate(comm, A)); 1440 PetscCall(MatSetSizes(*A, m, n, m, n)); 1441 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1442 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1443 PetscFunctionReturn(PETSC_SUCCESS); 1444 } 1445 1446 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1447 { 1448 PetscFunctionBegin; 1449 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1450 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1451 PetscFunctionReturn(PETSC_SUCCESS); 1452 } 1453 1454 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1455 { 1456 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1457 1458 PetscFunctionBegin; 1459 if (!factors->sptrsv_symbolic_completed) { 1460 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1461 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1462 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1463 } 1464 PetscFunctionReturn(PETSC_SUCCESS); 1465 } 1466 1467 /* Check if we need to update factors etc for transpose solve */ 1468 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1469 { 1470 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1471 MatColIdxType n = A->rmap->n; 1472 1473 PetscFunctionBegin; 1474 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1475 /* Update L^T and do sptrsv symbolic */ 1476 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1477 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1478 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1479 1480 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1481 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1482 1483 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1484 We have to sort the indices, until KK provides finer control options. 1485 */ 1486 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1487 1488 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1489 1490 /* Update U^T and do sptrsv symbolic */ 1491 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1492 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1493 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1494 1495 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1496 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1497 1498 /* Sort indices. See comments above */ 1499 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1500 1501 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1502 factors->transpose_updated = PETSC_TRUE; 1503 } 1504 PetscFunctionReturn(PETSC_SUCCESS); 1505 } 1506 1507 /* Solve Ax = b, with A = LU */ 1508 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1509 { 1510 ConstPetscScalarKokkosView bv; 1511 PetscScalarKokkosView xv; 1512 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1513 1514 PetscFunctionBegin; 1515 PetscCall(PetscLogGpuTimeBegin()); 1516 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1517 PetscCall(VecGetKokkosView(b, &bv)); 1518 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1519 /* Solve L tmpv = b */ 1520 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1521 /* Solve Ux = tmpv */ 1522 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1523 PetscCall(VecRestoreKokkosView(b, &bv)); 1524 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1525 PetscCall(PetscLogGpuTimeEnd()); 1526 PetscFunctionReturn(PETSC_SUCCESS); 1527 } 1528 1529 /* Solve A^T x = b, where A^T = U^T L^T */ 1530 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1531 { 1532 ConstPetscScalarKokkosView bv; 1533 PetscScalarKokkosView xv; 1534 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1535 1536 PetscFunctionBegin; 1537 PetscCall(PetscLogGpuTimeBegin()); 1538 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1539 PetscCall(VecGetKokkosView(b, &bv)); 1540 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1541 /* Solve U^T tmpv = b */ 1542 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1543 1544 /* Solve L^T x = tmpv */ 1545 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1546 PetscCall(VecRestoreKokkosView(b, &bv)); 1547 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1548 PetscCall(PetscLogGpuTimeEnd()); 1549 PetscFunctionReturn(PETSC_SUCCESS); 1550 } 1551 1552 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1553 { 1554 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1555 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1556 PetscInt fill_lev = info->levels; 1557 1558 PetscFunctionBegin; 1559 PetscCall(PetscLogGpuTimeBegin()); 1560 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1561 1562 auto a_d = aijkok->a_dual.view_device(); 1563 auto i_d = aijkok->i_dual.view_device(); 1564 auto j_d = aijkok->j_dual.view_device(); 1565 1566 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1567 1568 B->assembled = PETSC_TRUE; 1569 B->preallocated = PETSC_TRUE; 1570 B->ops->solve = MatSolve_SeqAIJKokkos; 1571 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1572 B->ops->matsolve = NULL; 1573 B->ops->matsolvetranspose = NULL; 1574 B->offloadmask = PETSC_OFFLOAD_GPU; 1575 1576 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1577 factors->transpose_updated = PETSC_FALSE; 1578 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1579 /* TODO: log flops, but how to know that? */ 1580 PetscCall(PetscLogGpuTimeEnd()); 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1585 { 1586 Mat_SeqAIJKokkos *aijkok; 1587 Mat_SeqAIJ *b; 1588 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1589 PetscInt fill_lev = info->levels; 1590 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1591 PetscInt n = A->rmap->n; 1592 1593 PetscFunctionBegin; 1594 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1595 /* Rebuild factors */ 1596 if (factors) { 1597 factors->Destroy(); 1598 } /* Destroy the old if it exists */ 1599 else { 1600 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1601 } 1602 1603 /* Create a spiluk handle and then do symbolic factorization */ 1604 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1605 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1606 1607 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1608 1609 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1610 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1611 Kokkos::realloc(factors->iU_d, n + 1); 1612 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1613 1614 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1615 auto i_d = aijkok->i_dual.view_device(); 1616 auto j_d = aijkok->j_dual.view_device(); 1617 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1618 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1619 1620 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1621 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1622 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1623 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1624 1625 /* TODO: add options to select sptrsv algorithms */ 1626 /* Create sptrsv handles for L, U and their transpose */ 1627 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1628 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1629 #else 1630 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1631 #endif 1632 1633 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1634 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1635 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1636 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1637 1638 /* Fill fields of the factor matrix B */ 1639 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1640 b = (Mat_SeqAIJ *)B->data; 1641 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1642 B->info.fill_ratio_given = info->fill; 1643 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1644 1645 B->offloadmask = PETSC_OFFLOAD_GPU; 1646 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1647 PetscFunctionReturn(PETSC_SUCCESS); 1648 } 1649 1650 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1651 { 1652 PetscFunctionBegin; 1653 *type = MATSOLVERKOKKOS; 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 /*MC 1658 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1659 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1660 1661 Level: beginner 1662 1663 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1664 M*/ 1665 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1666 { 1667 PetscInt n = A->rmap->n; 1668 1669 PetscFunctionBegin; 1670 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1671 PetscCall(MatSetSizes(*B, n, n, n, n)); 1672 (*B)->factortype = ftype; 1673 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1674 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1675 1676 if (ftype == MAT_FACTOR_LU) { 1677 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1678 (*B)->canuseordering = PETSC_TRUE; 1679 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1680 } else if (ftype == MAT_FACTOR_ILU) { 1681 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1682 (*B)->canuseordering = PETSC_FALSE; 1683 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1684 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1685 1686 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1687 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1688 PetscFunctionReturn(PETSC_SUCCESS); 1689 } 1690 1691 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1692 { 1693 PetscFunctionBegin; 1694 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1695 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1696 PetscFunctionReturn(PETSC_SUCCESS); 1697 } 1698 1699 /* Utility to print out a KokkosCsrMatrix for debugging */ 1700 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1701 { 1702 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1703 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1704 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1705 const PetscInt *i = iv.data(); 1706 const PetscInt *j = jv.data(); 1707 const PetscScalar *a = av.data(); 1708 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1709 1710 PetscFunctionBegin; 1711 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1712 for (PetscInt k = 0; k < m; k++) { 1713 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1714 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1715 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1716 } 1717 PetscFunctionReturn(PETSC_SUCCESS); 1718 } 1719