1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 #include <KokkosBatched_LU_Decl.hpp> 17 #include <KokkosBatched_InverseLU_Decl.hpp> 18 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 22 #include <KokkosSparse_Utils.hpp> 23 using KokkosSparse::sort_crs_matrix; 24 using KokkosSparse::Impl::transpose_matrix; 25 #else 26 #include <KokkosKernels_Sorting.hpp> 27 using KokkosKernels::sort_crs_matrix; 28 using KokkosKernels::Impl::transpose_matrix; 29 #endif 30 31 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 32 33 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 34 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 35 In the latter case, it is important to set a_dual's sync state correctly. 36 */ 37 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 38 { 39 Mat_SeqAIJ *aijseq; 40 Mat_SeqAIJKokkos *aijkok; 41 42 PetscFunctionBegin; 43 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 44 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 45 46 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 47 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 48 49 /* If aijkok does not exist, we just copy i, j to device. 50 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 51 In both cases, we build a new aijkok structure. 52 */ 53 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 54 delete aijkok; 55 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 56 A->spptr = aijkok; 57 } 58 59 PetscFunctionReturn(PETSC_SUCCESS); 60 } 61 62 /* Sync CSR data to device if not yet */ 63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 64 { 65 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 66 67 PetscFunctionBegin; 68 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 69 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 70 if (aijkok->a_dual.need_sync_device()) { 71 aijkok->a_dual.sync_device(); 72 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 73 aijkok->hermitian_updated = PETSC_FALSE; 74 } 75 PetscFunctionReturn(PETSC_SUCCESS); 76 } 77 78 /* Mark the CSR data on device as modified */ 79 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 80 { 81 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 82 83 PetscFunctionBegin; 84 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 85 aijkok->a_dual.clear_sync_state(); 86 aijkok->a_dual.modify_device(); 87 aijkok->transpose_updated = PETSC_FALSE; 88 aijkok->hermitian_updated = PETSC_FALSE; 89 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 90 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 91 PetscFunctionReturn(PETSC_SUCCESS); 92 } 93 94 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 95 { 96 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 97 98 PetscFunctionBegin; 99 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 100 /* We do not expect one needs factors on host */ 101 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 102 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 103 aijkok->a_dual.sync_host(); 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110 111 PetscFunctionBegin; 112 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 113 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 114 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 115 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 116 */ 117 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 118 aijkok->a_dual.sync_host(); 119 *array = aijkok->a_dual.view_host().data(); 120 } else { /* Happens when calling MatSetValues on a newly created matrix */ 121 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 122 } 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 132 PetscFunctionReturn(PETSC_SUCCESS); 133 } 134 135 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 136 { 137 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 138 139 PetscFunctionBegin; 140 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 141 aijkok->a_dual.sync_host(); 142 *array = aijkok->a_dual.view_host().data(); 143 } else { 144 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 145 } 146 PetscFunctionReturn(PETSC_SUCCESS); 147 } 148 149 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 150 { 151 PetscFunctionBegin; 152 *array = NULL; 153 PetscFunctionReturn(PETSC_SUCCESS); 154 } 155 156 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 162 *array = aijkok->a_dual.view_host().data(); 163 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 164 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 165 } 166 PetscFunctionReturn(PETSC_SUCCESS); 167 } 168 169 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 170 { 171 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 172 173 PetscFunctionBegin; 174 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 175 aijkok->a_dual.clear_sync_state(); 176 aijkok->a_dual.modify_host(); 177 } 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 187 188 if (i) *i = aijkok->i_device_data(); 189 if (j) *j = aijkok->j_device_data(); 190 if (a) { 191 aijkok->a_dual.sync_device(); 192 *a = aijkok->a_device_data(); 193 } 194 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 195 PetscFunctionReturn(PETSC_SUCCESS); 196 } 197 198 /* 199 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 200 201 Input Parameter: 202 . A - the MATSEQAIJKOKKOS matrix 203 204 Output Parameters: 205 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 206 - T_d - the transpose on device, whose value array is allocated but not initialized 207 */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 209 { 210 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 211 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 212 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 213 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 214 MatRowMapType *Ti = Ti_h.data(); 215 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 216 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 217 PetscInt *Tj = Tj_h.data(); 218 PetscInt *perm = perm_h.data(); 219 PetscInt *offset; 220 221 PetscFunctionBegin; 222 // Populate Ti 223 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 224 Ti++; 225 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 226 Ti--; 227 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 228 229 // Populate Tj and the permutation array 230 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 231 for (PetscInt i = 0; i < m; i++) { 232 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 233 PetscInt r = Aj[j]; // row r of T 234 PetscInt disp = Ti[r] + offset[r]; 235 236 Tj[disp] = i; // col i of T 237 perm[disp] = j; 238 offset[r]++; 239 } 240 } 241 PetscCall(PetscFree(offset)); 242 243 // Sort each row of T, along with the permutation array 244 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 245 246 // Output perm and T on device 247 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 248 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 249 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 250 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 251 PetscFunctionReturn(PETSC_SUCCESS); 252 } 253 254 // Generate the transpose on device and cache it internally 255 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 256 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 257 { 258 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 259 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 260 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 261 KokkosCsrMatrix &T = akok->csrmatT; 262 263 PetscFunctionBegin; 264 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 265 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 266 267 const auto &Aa = akok->a_dual.view_device(); 268 269 if (A->symmetric == PETSC_BOOL3_TRUE) { 270 *csrmatT = akok->csrmat; 271 } else { 272 // See if we already have a cached transpose and its value is up to date 273 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 274 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 275 const auto &perm = akok->transpose_perm; // get the permutation array 276 auto &Ta = T.values; 277 278 PetscCallCXX(Kokkos::parallel_for( 279 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 280 } 281 } else { // Generate T of size n x m for the first time 282 MatRowMapKokkosView perm; 283 284 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 285 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 286 PetscCallCXX(Kokkos::parallel_for( 287 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 288 } 289 akok->transpose_updated = PETSC_TRUE; 290 *csrmatT = akok->csrmatT; 291 } 292 PetscFunctionReturn(PETSC_SUCCESS); 293 } 294 295 // Generate the Hermitian on device and cache it internally 296 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 297 { 298 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 299 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 300 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 301 KokkosCsrMatrix &T = akok->csrmatH; 302 303 PetscFunctionBegin; 304 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 305 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 306 307 const auto &Aa = akok->a_dual.view_device(); 308 309 if (A->hermitian == PETSC_BOOL3_TRUE) { 310 *csrmatH = akok->csrmat; 311 } else { 312 // See if we already have a cached hermitian and its value is up to date 313 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 314 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 315 const auto &perm = akok->transpose_perm; // get the permutation array 316 auto &Ta = T.values; 317 318 PetscCallCXX(Kokkos::parallel_for( 319 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 320 } 321 } else { // Generate T of size n x m for the first time 322 MatRowMapKokkosView perm; 323 324 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 325 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 326 PetscCallCXX(Kokkos::parallel_for( 327 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 328 } 329 akok->hermitian_updated = PETSC_TRUE; 330 *csrmatH = akok->csrmatH; 331 } 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 335 /* y = A x */ 336 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 337 { 338 Mat_SeqAIJKokkos *aijkok; 339 ConstPetscScalarKokkosView xv; 340 PetscScalarKokkosView yv; 341 342 PetscFunctionBegin; 343 PetscCall(PetscLogGpuTimeBegin()); 344 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 345 PetscCall(VecGetKokkosView(xx, &xv)); 346 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 347 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 348 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 349 PetscCall(VecRestoreKokkosView(xx, &xv)); 350 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 351 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 352 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 353 PetscCall(PetscLogGpuTimeEnd()); 354 PetscFunctionReturn(PETSC_SUCCESS); 355 } 356 357 /* y = A^T x */ 358 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 359 { 360 Mat_SeqAIJKokkos *aijkok; 361 const char *mode; 362 ConstPetscScalarKokkosView xv; 363 PetscScalarKokkosView yv; 364 KokkosCsrMatrix csrmat; 365 366 PetscFunctionBegin; 367 PetscCall(PetscLogGpuTimeBegin()); 368 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 369 PetscCall(VecGetKokkosView(xx, &xv)); 370 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 371 if (A->form_explicit_transpose) { 372 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 373 mode = "N"; 374 } else { 375 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 376 csrmat = aijkok->csrmat; 377 mode = "T"; 378 } 379 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 380 PetscCall(VecRestoreKokkosView(xx, &xv)); 381 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 382 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 383 PetscCall(PetscLogGpuTimeEnd()); 384 PetscFunctionReturn(PETSC_SUCCESS); 385 } 386 387 /* y = A^H x */ 388 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 389 { 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv; 393 PetscScalarKokkosView yv; 394 KokkosCsrMatrix csrmat; 395 396 PetscFunctionBegin; 397 PetscCall(PetscLogGpuTimeBegin()); 398 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 399 PetscCall(VecGetKokkosView(xx, &xv)); 400 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 401 if (A->form_explicit_transpose) { 402 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 403 mode = "N"; 404 } else { 405 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 406 csrmat = aijkok->csrmat; 407 mode = "C"; 408 } 409 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 410 PetscCall(VecRestoreKokkosView(xx, &xv)); 411 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 412 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 413 PetscCall(PetscLogGpuTimeEnd()); 414 PetscFunctionReturn(PETSC_SUCCESS); 415 } 416 417 /* z = A x + y */ 418 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 419 { 420 Mat_SeqAIJKokkos *aijkok; 421 ConstPetscScalarKokkosView xv, yv; 422 PetscScalarKokkosView zv; 423 424 PetscFunctionBegin; 425 PetscCall(PetscLogGpuTimeBegin()); 426 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 427 PetscCall(VecGetKokkosView(xx, &xv)); 428 PetscCall(VecGetKokkosView(yy, &yv)); 429 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 430 if (zz != yy) Kokkos::deep_copy(zv, yv); 431 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 432 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 433 PetscCall(VecRestoreKokkosView(xx, &xv)); 434 PetscCall(VecRestoreKokkosView(yy, &yv)); 435 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 436 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 437 PetscCall(PetscLogGpuTimeEnd()); 438 PetscFunctionReturn(PETSC_SUCCESS); 439 } 440 441 /* z = A^T x + y */ 442 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 443 { 444 Mat_SeqAIJKokkos *aijkok; 445 const char *mode; 446 ConstPetscScalarKokkosView xv, yv; 447 PetscScalarKokkosView zv; 448 KokkosCsrMatrix csrmat; 449 450 PetscFunctionBegin; 451 PetscCall(PetscLogGpuTimeBegin()); 452 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 453 PetscCall(VecGetKokkosView(xx, &xv)); 454 PetscCall(VecGetKokkosView(yy, &yv)); 455 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 456 if (zz != yy) Kokkos::deep_copy(zv, yv); 457 if (A->form_explicit_transpose) { 458 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 459 mode = "N"; 460 } else { 461 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 462 csrmat = aijkok->csrmat; 463 mode = "T"; 464 } 465 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 466 PetscCall(VecRestoreKokkosView(xx, &xv)); 467 PetscCall(VecRestoreKokkosView(yy, &yv)); 468 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 469 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 470 PetscCall(PetscLogGpuTimeEnd()); 471 PetscFunctionReturn(PETSC_SUCCESS); 472 } 473 474 /* z = A^H x + y */ 475 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 476 { 477 Mat_SeqAIJKokkos *aijkok; 478 const char *mode; 479 ConstPetscScalarKokkosView xv, yv; 480 PetscScalarKokkosView zv; 481 KokkosCsrMatrix csrmat; 482 483 PetscFunctionBegin; 484 PetscCall(PetscLogGpuTimeBegin()); 485 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 486 PetscCall(VecGetKokkosView(xx, &xv)); 487 PetscCall(VecGetKokkosView(yy, &yv)); 488 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 489 if (zz != yy) Kokkos::deep_copy(zv, yv); 490 if (A->form_explicit_transpose) { 491 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 492 mode = "N"; 493 } else { 494 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 495 csrmat = aijkok->csrmat; 496 mode = "C"; 497 } 498 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 499 PetscCall(VecRestoreKokkosView(xx, &xv)); 500 PetscCall(VecRestoreKokkosView(yy, &yv)); 501 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 502 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 503 PetscCall(PetscLogGpuTimeEnd()); 504 PetscFunctionReturn(PETSC_SUCCESS); 505 } 506 507 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508 { 509 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510 511 PetscFunctionBegin; 512 switch (op) { 513 case MAT_FORM_EXPLICIT_TRANSPOSE: 514 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 515 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516 A->form_explicit_transpose = flg; 517 break; 518 default: 519 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520 break; 521 } 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 /* Depending on reuse, either build a new mat, or use the existing mat */ 526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527 { 528 Mat_SeqAIJ *aseq; 529 530 PetscFunctionBegin; 531 PetscCall(PetscKokkosInitializeCheck()); 532 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 533 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 534 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 535 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 537 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 538 PetscCall(PetscFree(A->defaultvectype)); 539 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 540 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 541 PetscCall(MatSetOps_SeqAIJKokkos(A)); 542 aseq = static_cast<Mat_SeqAIJ *>(A->data); 543 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 544 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 546 } 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553 */ 554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555 { 556 Mat_SeqAIJ *bseq; 557 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558 Mat mat; 559 560 PetscFunctionBegin; 561 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 562 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563 mat = *B; 564 if (A->preallocated) { 565 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 567 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568 /* Now copy values to B if needed */ 569 if (dupOption == MAT_COPY_VALUES) { 570 if (akok->a_dual.need_sync_device()) { 571 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572 bkok->a_dual.modify_host(); 573 } else { /* If device has the latest data, we only copy data on device */ 574 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575 bkok->a_dual.modify_device(); 576 } 577 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579 bkok->a_dual.modify_host(); 580 } 581 mat->spptr = bkok; 582 } 583 584 PetscCall(PetscFree(mat->defaultvectype)); 585 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 586 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 587 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592 { 593 Mat At; 594 KokkosCsrMatrix internT; 595 Mat_SeqAIJKokkos *atkok, *bkok; 596 597 PetscFunctionBegin; 598 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 599 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 600 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601 /* Deep copy internT, as we want to isolate the internal transpose */ 602 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 603 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 604 if (reuse == MAT_INITIAL_MATRIX) *B = At; 605 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 606 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 607 if ((*B)->assembled) { 608 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 609 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 610 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 611 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 612 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 613 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 614 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 615 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 616 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 617 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 618 } 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623 { 624 Mat_SeqAIJKokkos *aijkok; 625 626 PetscFunctionBegin; 627 if (A->factortype == MAT_FACTOR_NONE) { 628 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 629 delete aijkok; 630 } else { 631 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 632 } 633 A->spptr = NULL; 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 635 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 636 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 637 PetscCall(MatDestroy_SeqAIJ(A)); 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 /*MC 642 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 643 644 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 645 646 Options Database Key: 647 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 648 649 Level: beginner 650 651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 652 M*/ 653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 654 { 655 PetscFunctionBegin; 656 PetscCall(PetscKokkosInitializeCheck()); 657 PetscCall(MatCreate_SeqAIJ(A)); 658 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 659 PetscFunctionReturn(PETSC_SUCCESS); 660 } 661 662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 664 { 665 Mat_SeqAIJ *a, *b; 666 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667 MatScalarKokkosView aa, ba, ca; 668 MatRowMapKokkosView ai, bi, ci; 669 MatColIdxKokkosView aj, bj, cj; 670 PetscInt m, n, nnz, aN; 671 672 PetscFunctionBegin; 673 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 674 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 675 PetscValidPointer(C, 4); 676 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 677 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 678 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 679 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 680 681 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 682 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 683 a = static_cast<Mat_SeqAIJ *>(A->data); 684 b = static_cast<Mat_SeqAIJ *>(B->data); 685 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 686 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 687 aa = akok->a_dual.view_device(); 688 ai = akok->i_dual.view_device(); 689 ba = bkok->a_dual.view_device(); 690 bi = bkok->i_dual.view_device(); 691 m = A->rmap->n; /* M, N and nnz of C */ 692 n = A->cmap->n + B->cmap->n; 693 nnz = a->nz + b->nz; 694 aN = A->cmap->n; /* N of A */ 695 if (reuse == MAT_INITIAL_MATRIX) { 696 aj = akok->j_dual.view_device(); 697 bj = bkok->j_dual.view_device(); 698 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 699 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 700 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 701 ca = ca_dual.view_device(); 702 ci = ci_dual.view_device(); 703 cj = cj_dual.view_device(); 704 705 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 706 Kokkos::parallel_for( 707 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 708 PetscInt i = t.league_rank(); /* row i */ 709 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 710 711 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 712 ci(i) = coffset; 713 if (i == m - 1) ci(m) = ai(m) + bi(m); 714 }); 715 716 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 717 if (k < alen) { 718 ca(coffset + k) = aa(ai(i) + k); 719 cj(coffset + k) = aj(ai(i) + k); 720 } else { 721 ca(coffset + k) = ba(bi(i) + k - alen); 722 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 723 } 724 }); 725 }); 726 ca_dual.modify_device(); 727 ci_dual.modify_device(); 728 cj_dual.modify_device(); 729 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 730 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 731 } else if (reuse == MAT_REUSE_MATRIX) { 732 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 733 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 734 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 735 ca = ckok->a_dual.view_device(); 736 ci = ckok->i_dual.view_device(); 737 738 Kokkos::parallel_for( 739 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 740 PetscInt i = t.league_rank(); /* row i */ 741 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 742 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 743 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 744 else ca(ci(i) + k) = ba(bi(i) + k - alen); 745 }); 746 }); 747 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 748 } 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 753 { 754 PetscFunctionBegin; 755 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 760 { 761 Mat_Product *product = C->product; 762 Mat A, B; 763 bool transA, transB; /* use bool, since KK needs this type */ 764 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 765 Mat_SeqAIJ *c; 766 MatProductData_SeqAIJKokkos *pdata; 767 KokkosCsrMatrix csrmatA, csrmatB; 768 769 PetscFunctionBegin; 770 MatCheckProduct(C, 1); 771 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 772 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 773 774 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 775 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 776 // we still do numeric. 777 if (pdata->reusesym) { // numeric reuses results from symbolic 778 pdata->reusesym = PETSC_FALSE; 779 PetscFunctionReturn(PETSC_SUCCESS); 780 } 781 782 switch (product->type) { 783 case MATPRODUCT_AB: 784 transA = false; 785 transB = false; 786 break; 787 case MATPRODUCT_AtB: 788 transA = true; 789 transB = false; 790 break; 791 case MATPRODUCT_ABt: 792 transA = false; 793 transB = true; 794 break; 795 default: 796 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 797 } 798 799 A = product->A; 800 B = product->B; 801 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 802 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 803 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 804 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 805 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 806 807 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 808 809 csrmatA = akok->csrmat; 810 csrmatB = bkok->csrmat; 811 812 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 813 if (transA) { 814 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 815 transA = false; 816 } 817 818 if (transB) { 819 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 820 transB = false; 821 } 822 PetscCall(PetscLogGpuTimeBegin()); 823 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 825 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 826 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 827 #endif 828 829 PetscCall(PetscLogGpuTimeEnd()); 830 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 831 /* shorter version of MatAssemblyEnd_SeqAIJ */ 832 c = (Mat_SeqAIJ *)C->data; 833 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 834 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 835 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 836 c->reallocs = 0; 837 C->info.mallocs = 0; 838 C->info.nz_unneeded = 0; 839 C->assembled = C->was_assembled = PETSC_TRUE; 840 C->num_ass++; 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 845 { 846 Mat_Product *product = C->product; 847 MatProductType ptype; 848 Mat A, B; 849 bool transA, transB; 850 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 851 MatProductData_SeqAIJKokkos *pdata; 852 MPI_Comm comm; 853 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 854 855 PetscFunctionBegin; 856 MatCheckProduct(C, 1); 857 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 858 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 859 A = product->A; 860 B = product->B; 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 863 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 864 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 865 csrmatA = akok->csrmat; 866 csrmatB = bkok->csrmat; 867 868 ptype = product->type; 869 // Take advantage of the symmetry if true 870 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 871 ptype = MATPRODUCT_AB; 872 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 873 } 874 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 875 ptype = MATPRODUCT_AB; 876 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 877 } 878 879 switch (ptype) { 880 case MATPRODUCT_AB: 881 transA = false; 882 transB = false; 883 break; 884 case MATPRODUCT_AtB: 885 transA = true; 886 transB = false; 887 break; 888 case MATPRODUCT_ABt: 889 transA = false; 890 transB = true; 891 break; 892 default: 893 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 894 } 895 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 896 pdata->reusesym = product->api_user; 897 898 /* TODO: add command line options to select spgemm algorithms */ 899 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 900 901 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 903 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 904 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 905 #endif 906 #endif 907 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 908 909 PetscCall(PetscLogGpuTimeBegin()); 910 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 911 if (transA) { 912 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 913 transA = false; 914 } 915 916 if (transB) { 917 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 918 transB = false; 919 } 920 921 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 922 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 923 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 924 calling new Mat_SeqAIJKokkos(). 925 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 926 */ 927 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 929 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 930 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 931 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 932 #endif 933 PetscCall(PetscLogGpuTimeEnd()); 934 935 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 936 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 937 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 938 PetscFunctionReturn(PETSC_SUCCESS); 939 } 940 941 /* handles sparse matrix matrix ops */ 942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 943 { 944 Mat_Product *product = mat->product; 945 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 946 947 PetscFunctionBegin; 948 MatCheckProduct(mat, 1); 949 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 950 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 951 if (Biskok && Ciskok) { 952 switch (product->type) { 953 case MATPRODUCT_AB: 954 case MATPRODUCT_AtB: 955 case MATPRODUCT_ABt: 956 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 957 break; 958 case MATPRODUCT_PtAP: 959 case MATPRODUCT_RARt: 960 case MATPRODUCT_ABC: 961 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 962 break; 963 default: 964 break; 965 } 966 } else { /* fallback for AIJ */ 967 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 968 } 969 PetscFunctionReturn(PETSC_SUCCESS); 970 } 971 972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 973 { 974 Mat_SeqAIJKokkos *aijkok; 975 976 PetscFunctionBegin; 977 PetscCall(PetscLogGpuTimeBegin()); 978 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 979 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 980 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 981 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 982 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 983 PetscCall(PetscLogGpuTimeEnd()); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 988 { 989 Mat_SeqAIJKokkos *aijkok; 990 991 PetscFunctionBegin; 992 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 993 if (aijkok) { /* Only zero the device if data is already there */ 994 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 995 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 996 } else { /* Might be preallocated but not assembled */ 997 PetscCall(MatZeroEntries_SeqAIJ(A)); 998 } 999 PetscFunctionReturn(PETSC_SUCCESS); 1000 } 1001 1002 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1003 { 1004 Mat_SeqAIJ *aijseq; 1005 Mat_SeqAIJKokkos *aijkok; 1006 PetscInt n; 1007 PetscScalarKokkosView xv; 1008 1009 PetscFunctionBegin; 1010 PetscCall(VecGetLocalSize(x, &n)); 1011 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1012 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1013 1014 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1015 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1016 1017 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1018 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1019 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1020 aijkok->SetDiagonal(aijseq->diag); 1021 } 1022 1023 const auto &Aa = aijkok->a_dual.view_device(); 1024 const auto &Ai = aijkok->i_dual.view_device(); 1025 const auto &Adiag = aijkok->diag_dual.view_device(); 1026 1027 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1028 Kokkos::parallel_for( 1029 n, KOKKOS_LAMBDA(const PetscInt i) { 1030 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1031 else xv(i) = 0; 1032 }); 1033 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1034 PetscFunctionReturn(PETSC_SUCCESS); 1035 } 1036 1037 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1038 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1039 { 1040 Mat_SeqAIJKokkos *aijkok; 1041 1042 PetscFunctionBegin; 1043 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1044 PetscValidPointer(kv, 2); 1045 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1047 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1048 *kv = aijkok->a_dual.view_device(); 1049 PetscFunctionReturn(PETSC_SUCCESS); 1050 } 1051 1052 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1053 { 1054 PetscFunctionBegin; 1055 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1056 PetscValidPointer(kv, 2); 1057 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1058 PetscFunctionReturn(PETSC_SUCCESS); 1059 } 1060 1061 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1062 { 1063 Mat_SeqAIJKokkos *aijkok; 1064 1065 PetscFunctionBegin; 1066 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1067 PetscValidPointer(kv, 2); 1068 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1069 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1070 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1071 *kv = aijkok->a_dual.view_device(); 1072 PetscFunctionReturn(PETSC_SUCCESS); 1073 } 1074 1075 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1076 { 1077 PetscFunctionBegin; 1078 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1079 PetscValidPointer(kv, 2); 1080 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1081 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1082 PetscFunctionReturn(PETSC_SUCCESS); 1083 } 1084 1085 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1086 { 1087 Mat_SeqAIJKokkos *aijkok; 1088 1089 PetscFunctionBegin; 1090 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1091 PetscValidPointer(kv, 2); 1092 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1093 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1094 *kv = aijkok->a_dual.view_device(); 1095 PetscFunctionReturn(PETSC_SUCCESS); 1096 } 1097 1098 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1099 { 1100 PetscFunctionBegin; 1101 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1102 PetscValidPointer(kv, 2); 1103 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1104 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1105 PetscFunctionReturn(PETSC_SUCCESS); 1106 } 1107 1108 /* Computes Y += alpha X */ 1109 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1110 { 1111 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1112 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1113 ConstMatScalarKokkosView Xa; 1114 MatScalarKokkosView Ya; 1115 1116 PetscFunctionBegin; 1117 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1118 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1119 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1120 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1121 PetscCall(PetscLogGpuTimeBegin()); 1122 1123 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1124 PetscBool e; 1125 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1126 if (e) { 1127 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1128 if (e) pattern = SAME_NONZERO_PATTERN; 1129 } 1130 } 1131 1132 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1133 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1134 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1135 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1136 */ 1137 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1138 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1139 Xa = xkok->a_dual.view_device(); 1140 Ya = ykok->a_dual.view_device(); 1141 1142 if (pattern == SAME_NONZERO_PATTERN) { 1143 KokkosBlas::axpy(alpha, Xa, Ya); 1144 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1145 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1146 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1147 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1148 1149 Kokkos::parallel_for( 1150 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1151 PetscInt i = t.league_rank(); // row i 1152 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1153 // Only one thread works in a team 1154 PetscInt p, q = Yi(i); 1155 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1156 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1157 if (Xj(p) == Yj(q)) { // Found it 1158 Ya(q) += alpha * Xa(p); 1159 q++; 1160 } else { 1161 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1162 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1163 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1164 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1165 #else 1166 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1167 #endif 1168 } 1169 } 1170 }); 1171 }); 1172 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1173 } else { // different nonzero patterns 1174 Mat Z; 1175 KokkosCsrMatrix zcsr; 1176 KernelHandle kh; 1177 kh.create_spadd_handle(true); // X, Y are sorted 1178 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1179 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1180 zkok = new Mat_SeqAIJKokkos(zcsr); 1181 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1182 PetscCall(MatHeaderReplace(Y, &Z)); 1183 kh.destroy_spadd_handle(); 1184 } 1185 PetscCall(PetscLogGpuTimeEnd()); 1186 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 struct MatCOOStruct_SeqAIJKokkos { 1191 PetscCount n; 1192 PetscCount Atot; 1193 PetscInt nz; 1194 PetscCountKokkosView jmap; 1195 PetscCountKokkosView perm; 1196 1197 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1198 { 1199 nz = coo_h->nz; 1200 n = coo_h->n; 1201 Atot = coo_h->Atot; 1202 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1203 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1204 } 1205 }; 1206 1207 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1208 { 1209 PetscFunctionBegin; 1210 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1215 { 1216 Mat_SeqAIJKokkos *akok; 1217 Mat_SeqAIJ *aseq; 1218 PetscContainer container_h, container_d; 1219 MatCOOStruct_SeqAIJ *coo_h; 1220 MatCOOStruct_SeqAIJKokkos *coo_d; 1221 1222 PetscFunctionBegin; 1223 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1224 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1225 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1226 delete akok; 1227 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1228 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1229 1230 // Copy the COO struct to device 1231 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1232 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1233 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1234 1235 // Put the COO struct in a container and then attach that to the matrix 1236 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1237 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1238 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1239 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1240 PetscCall(PetscContainerDestroy(&container_d)); 1241 PetscFunctionReturn(PETSC_SUCCESS); 1242 } 1243 1244 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1245 { 1246 MatScalarKokkosView Aa; 1247 ConstMatScalarKokkosView kv; 1248 PetscMemType memtype; 1249 PetscContainer container; 1250 MatCOOStruct_SeqAIJKokkos *coo; 1251 1252 PetscFunctionBegin; 1253 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1254 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1255 1256 const auto &n = coo->n; 1257 const auto &Annz = coo->nz; 1258 const auto &jmap = coo->jmap; 1259 const auto &perm = coo->perm; 1260 1261 PetscCall(PetscGetMemType(v, &memtype)); 1262 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1263 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1264 } else { 1265 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1266 } 1267 1268 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1269 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1270 1271 Kokkos::parallel_for( 1272 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1273 PetscScalar sum = 0.0; 1274 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1275 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1276 }); 1277 1278 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1279 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1280 PetscFunctionReturn(PETSC_SUCCESS); 1281 } 1282 1283 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1284 { 1285 PetscFunctionBegin; 1286 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1287 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1288 B->offloadmask = PETSC_OFFLOAD_CPU; 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 1292 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1293 { 1294 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1295 1296 PetscFunctionBegin; 1297 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1298 A->boundtocpu = PETSC_FALSE; 1299 1300 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1301 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1302 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1303 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1304 A->ops->scale = MatScale_SeqAIJKokkos; 1305 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1306 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1307 A->ops->mult = MatMult_SeqAIJKokkos; 1308 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1309 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1310 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1311 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1312 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1313 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1314 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1315 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1316 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1317 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1318 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1319 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1320 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1321 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1322 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1323 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1324 1325 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1326 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1327 PetscFunctionReturn(PETSC_SUCCESS); 1328 } 1329 1330 /* 1331 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1332 1333 Input Parameters: 1334 + A - the MATSEQAIJKOKKOS matrix 1335 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1336 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1337 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1338 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1339 1340 Output Parameter: 1341 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1342 */ 1343 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1344 { 1345 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1346 PetscInt N = A->rmap->n; 1347 PetscInt nblocks = bs.extent(0) - 1; 1348 1349 PetscFunctionBegin; 1350 // Set the diagonal pointer on device if not already 1351 if (N && akok->diag_dual.extent(0) == 0) { 1352 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1353 akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag); 1354 } 1355 1356 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1357 1358 // Pull out the diagonal blocks of the matrix and then invert the blocks 1359 auto Aa = akok->a_dual.view_device(); 1360 auto Ai = akok->i_dual.view_device(); 1361 auto Aj = akok->j_dual.view_device(); 1362 auto Adiag = akok->diag_dual.view_device(); 1363 // TODO: how to tune the team size? 1364 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1365 auto ts = Kokkos::AUTO(); 1366 #else 1367 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1368 #endif 1369 PetscCallCXX(Kokkos::parallel_for( 1370 Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1371 const PetscInt bid = teamMember.league_rank(); // block id 1372 const PetscInt rstart = bs(bid); // this block starts from this row 1373 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1374 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1375 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1376 1377 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1378 PetscInt i = rstart + r; // i-th row in A 1379 1380 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1381 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1382 1383 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1384 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1385 B(r, c) = 0.0; 1386 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1387 B(r, c) = Aa(first + c); 1388 } else { // this entry does not show up in the CSR 1389 B(r, c) = 0.0; 1390 } 1391 } 1392 } else { // rare case that the diagonal does not exist 1393 const PetscInt begin = Ai(i); 1394 const PetscInt end = Ai(i + 1); 1395 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1396 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1397 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1398 else if (Aj(j) >= rstart + m) break; 1399 } 1400 } 1401 }); 1402 1403 // LU-decompose B (w/o pivoting) and then invert B 1404 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1405 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1406 })); 1407 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1408 PetscFunctionReturn(PETSC_SUCCESS); 1409 } 1410 1411 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1412 { 1413 Mat_SeqAIJ *aseq; 1414 PetscInt i, m, n; 1415 1416 PetscFunctionBegin; 1417 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1418 1419 m = akok->nrows(); 1420 n = akok->ncols(); 1421 PetscCall(MatSetSizes(A, m, n, m, n)); 1422 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1423 1424 /* Set up data structures of A as a MATSEQAIJ */ 1425 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1426 aseq = (Mat_SeqAIJ *)(A)->data; 1427 1428 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1429 akok->j_dual.sync_host(); 1430 1431 aseq->i = akok->i_host_data(); 1432 aseq->j = akok->j_host_data(); 1433 aseq->a = akok->a_host_data(); 1434 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1435 aseq->singlemalloc = PETSC_FALSE; 1436 aseq->free_a = PETSC_FALSE; 1437 aseq->free_ij = PETSC_FALSE; 1438 aseq->nz = akok->nnz(); 1439 aseq->maxnz = aseq->nz; 1440 1441 PetscCall(PetscMalloc1(m, &aseq->imax)); 1442 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1443 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1444 1445 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1446 akok->nonzerostate = A->nonzerostate; 1447 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1448 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1449 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1450 PetscFunctionReturn(PETSC_SUCCESS); 1451 } 1452 1453 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1454 { 1455 PetscFunctionBegin; 1456 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1457 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1458 PetscFunctionReturn(PETSC_SUCCESS); 1459 } 1460 1461 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1462 { 1463 Mat_SeqAIJKokkos *akok; 1464 PetscFunctionBegin; 1465 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1466 PetscCall(MatCreate(comm, A)); 1467 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1468 PetscFunctionReturn(PETSC_SUCCESS); 1469 } 1470 1471 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1472 1473 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1474 */ 1475 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1476 { 1477 PetscFunctionBegin; 1478 PetscCall(MatCreate(comm, A)); 1479 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1480 PetscFunctionReturn(PETSC_SUCCESS); 1481 } 1482 1483 /*@C 1484 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1485 (the default parallel PETSc format). This matrix will ultimately be handled by 1486 Kokkos for calculations. 1487 1488 Collective 1489 1490 Input Parameters: 1491 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1492 . m - number of rows 1493 . n - number of columns 1494 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1495 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1496 1497 Output Parameter: 1498 . A - the matrix 1499 1500 Level: intermediate 1501 1502 Notes: 1503 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1504 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1505 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1506 1507 The AIJ format, also called 1508 compressed row storage, is fully compatible with standard Fortran 1509 storage. That is, the stored row and column indices can begin at 1510 either one (as in Fortran) or zero. 1511 1512 Specify the preallocated storage with either `nz` or `nnz` (not both). 1513 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1514 allocation. 1515 1516 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1517 @*/ 1518 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1519 { 1520 PetscFunctionBegin; 1521 PetscCall(PetscKokkosInitializeCheck()); 1522 PetscCall(MatCreate(comm, A)); 1523 PetscCall(MatSetSizes(*A, m, n, m, n)); 1524 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1525 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1526 PetscFunctionReturn(PETSC_SUCCESS); 1527 } 1528 1529 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1530 { 1531 PetscFunctionBegin; 1532 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1533 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1534 PetscFunctionReturn(PETSC_SUCCESS); 1535 } 1536 1537 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1538 { 1539 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1540 1541 PetscFunctionBegin; 1542 if (!factors->sptrsv_symbolic_completed) { 1543 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1544 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1545 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1546 } 1547 PetscFunctionReturn(PETSC_SUCCESS); 1548 } 1549 1550 /* Check if we need to update factors etc for transpose solve */ 1551 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1552 { 1553 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1554 MatColIdxType n = A->rmap->n; 1555 1556 PetscFunctionBegin; 1557 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1558 /* Update L^T and do sptrsv symbolic */ 1559 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1560 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1561 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1562 1563 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1564 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1565 1566 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1567 We have to sort the indices, until KK provides finer control options. 1568 */ 1569 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1570 1571 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1572 1573 /* Update U^T and do sptrsv symbolic */ 1574 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1575 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1576 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1577 1578 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1579 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1580 1581 /* Sort indices. See comments above */ 1582 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1583 1584 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1585 factors->transpose_updated = PETSC_TRUE; 1586 } 1587 PetscFunctionReturn(PETSC_SUCCESS); 1588 } 1589 1590 /* Solve Ax = b, with A = LU */ 1591 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1592 { 1593 ConstPetscScalarKokkosView bv; 1594 PetscScalarKokkosView xv; 1595 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1596 1597 PetscFunctionBegin; 1598 PetscCall(PetscLogGpuTimeBegin()); 1599 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1600 PetscCall(VecGetKokkosView(b, &bv)); 1601 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1602 /* Solve L tmpv = b */ 1603 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1604 /* Solve Ux = tmpv */ 1605 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1606 PetscCall(VecRestoreKokkosView(b, &bv)); 1607 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1608 PetscCall(PetscLogGpuTimeEnd()); 1609 PetscFunctionReturn(PETSC_SUCCESS); 1610 } 1611 1612 /* Solve A^T x = b, where A^T = U^T L^T */ 1613 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1614 { 1615 ConstPetscScalarKokkosView bv; 1616 PetscScalarKokkosView xv; 1617 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1618 1619 PetscFunctionBegin; 1620 PetscCall(PetscLogGpuTimeBegin()); 1621 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1622 PetscCall(VecGetKokkosView(b, &bv)); 1623 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1624 /* Solve U^T tmpv = b */ 1625 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1626 1627 /* Solve L^T x = tmpv */ 1628 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1629 PetscCall(VecRestoreKokkosView(b, &bv)); 1630 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1631 PetscCall(PetscLogGpuTimeEnd()); 1632 PetscFunctionReturn(PETSC_SUCCESS); 1633 } 1634 1635 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1636 { 1637 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1638 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1639 PetscInt fill_lev = info->levels; 1640 1641 PetscFunctionBegin; 1642 PetscCall(PetscLogGpuTimeBegin()); 1643 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1644 1645 auto a_d = aijkok->a_dual.view_device(); 1646 auto i_d = aijkok->i_dual.view_device(); 1647 auto j_d = aijkok->j_dual.view_device(); 1648 1649 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1650 1651 B->assembled = PETSC_TRUE; 1652 B->preallocated = PETSC_TRUE; 1653 B->ops->solve = MatSolve_SeqAIJKokkos; 1654 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1655 B->ops->matsolve = NULL; 1656 B->ops->matsolvetranspose = NULL; 1657 B->offloadmask = PETSC_OFFLOAD_GPU; 1658 1659 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1660 factors->transpose_updated = PETSC_FALSE; 1661 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1662 /* TODO: log flops, but how to know that? */ 1663 PetscCall(PetscLogGpuTimeEnd()); 1664 PetscFunctionReturn(PETSC_SUCCESS); 1665 } 1666 1667 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1668 { 1669 Mat_SeqAIJKokkos *aijkok; 1670 Mat_SeqAIJ *b; 1671 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1672 PetscInt fill_lev = info->levels; 1673 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1674 PetscInt n = A->rmap->n; 1675 1676 PetscFunctionBegin; 1677 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1678 /* Rebuild factors */ 1679 if (factors) { 1680 factors->Destroy(); 1681 } /* Destroy the old if it exists */ 1682 else { 1683 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1684 } 1685 1686 /* Create a spiluk handle and then do symbolic factorization */ 1687 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1688 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1689 1690 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1691 1692 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1693 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1694 Kokkos::realloc(factors->iU_d, n + 1); 1695 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1696 1697 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1698 auto i_d = aijkok->i_dual.view_device(); 1699 auto j_d = aijkok->j_dual.view_device(); 1700 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1701 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1702 1703 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1704 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1705 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1706 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1707 1708 /* TODO: add options to select sptrsv algorithms */ 1709 /* Create sptrsv handles for L, U and their transpose */ 1710 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1711 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1712 #else 1713 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1714 #endif 1715 1716 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1717 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1718 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1719 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1720 1721 /* Fill fields of the factor matrix B */ 1722 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1723 b = (Mat_SeqAIJ *)B->data; 1724 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1725 B->info.fill_ratio_given = info->fill; 1726 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1727 1728 B->offloadmask = PETSC_OFFLOAD_GPU; 1729 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1730 PetscFunctionReturn(PETSC_SUCCESS); 1731 } 1732 1733 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1734 { 1735 PetscFunctionBegin; 1736 *type = MATSOLVERKOKKOS; 1737 PetscFunctionReturn(PETSC_SUCCESS); 1738 } 1739 1740 /*MC 1741 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1742 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1743 1744 Level: beginner 1745 1746 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1747 M*/ 1748 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1749 { 1750 PetscInt n = A->rmap->n; 1751 1752 PetscFunctionBegin; 1753 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1754 PetscCall(MatSetSizes(*B, n, n, n, n)); 1755 (*B)->factortype = ftype; 1756 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1757 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1758 1759 if (ftype == MAT_FACTOR_LU) { 1760 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1761 (*B)->canuseordering = PETSC_TRUE; 1762 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1763 } else if (ftype == MAT_FACTOR_ILU) { 1764 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1765 (*B)->canuseordering = PETSC_FALSE; 1766 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1767 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1768 1769 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1770 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1771 PetscFunctionReturn(PETSC_SUCCESS); 1772 } 1773 1774 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1775 { 1776 PetscFunctionBegin; 1777 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1778 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1779 PetscFunctionReturn(PETSC_SUCCESS); 1780 } 1781 1782 /* Utility to print out a KokkosCsrMatrix for debugging */ 1783 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1784 { 1785 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1786 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1787 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1788 const PetscInt *i = iv.data(); 1789 const PetscInt *j = jv.data(); 1790 const PetscScalar *a = av.data(); 1791 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1792 1793 PetscFunctionBegin; 1794 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1795 for (PetscInt k = 0; k < m; k++) { 1796 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1797 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1798 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1799 } 1800 PetscFunctionReturn(PETSC_SUCCESS); 1801 } 1802