1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 #include <KokkosBatched_LU_Decl.hpp> 17 #include <KokkosBatched_InverseLU_Decl.hpp> 18 19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 20 21 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 22 #include <KokkosSparse_Utils.hpp> 23 using KokkosSparse::sort_crs_matrix; 24 using KokkosSparse::Impl::transpose_matrix; 25 #else 26 #include <KokkosKernels_Sorting.hpp> 27 using KokkosKernels::sort_crs_matrix; 28 using KokkosKernels::Impl::transpose_matrix; 29 #endif 30 31 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 32 33 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 34 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 35 In the latter case, it is important to set a_dual's sync state correctly. 36 */ 37 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 38 { 39 Mat_SeqAIJ *aijseq; 40 Mat_SeqAIJKokkos *aijkok; 41 42 PetscFunctionBegin; 43 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 44 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 45 46 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 47 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 48 49 /* If aijkok does not exist, we just copy i, j to device. 50 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 51 In both cases, we build a new aijkok structure. 52 */ 53 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 54 delete aijkok; 55 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 56 A->spptr = aijkok; 57 } 58 59 PetscFunctionReturn(PETSC_SUCCESS); 60 } 61 62 /* Sync CSR data to device if not yet */ 63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 64 { 65 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 66 67 PetscFunctionBegin; 68 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 69 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 70 if (aijkok->a_dual.need_sync_device()) { 71 aijkok->a_dual.sync_device(); 72 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 73 aijkok->hermitian_updated = PETSC_FALSE; 74 } 75 PetscFunctionReturn(PETSC_SUCCESS); 76 } 77 78 /* Mark the CSR data on device as modified */ 79 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 80 { 81 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 82 83 PetscFunctionBegin; 84 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 85 aijkok->a_dual.clear_sync_state(); 86 aijkok->a_dual.modify_device(); 87 aijkok->transpose_updated = PETSC_FALSE; 88 aijkok->hermitian_updated = PETSC_FALSE; 89 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 90 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 91 PetscFunctionReturn(PETSC_SUCCESS); 92 } 93 94 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 95 { 96 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 97 98 PetscFunctionBegin; 99 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 100 /* We do not expect one needs factors on host */ 101 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 102 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 103 aijkok->a_dual.sync_host(); 104 PetscFunctionReturn(PETSC_SUCCESS); 105 } 106 107 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 108 { 109 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110 111 PetscFunctionBegin; 112 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 113 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 114 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 115 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 116 */ 117 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 118 aijkok->a_dual.sync_host(); 119 *array = aijkok->a_dual.view_host().data(); 120 } else { /* Happens when calling MatSetValues on a newly created matrix */ 121 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 122 } 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 127 { 128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 129 130 PetscFunctionBegin; 131 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 132 PetscFunctionReturn(PETSC_SUCCESS); 133 } 134 135 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 136 { 137 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 138 139 PetscFunctionBegin; 140 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 141 aijkok->a_dual.sync_host(); 142 *array = aijkok->a_dual.view_host().data(); 143 } else { 144 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 145 } 146 PetscFunctionReturn(PETSC_SUCCESS); 147 } 148 149 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 150 { 151 PetscFunctionBegin; 152 *array = NULL; 153 PetscFunctionReturn(PETSC_SUCCESS); 154 } 155 156 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 162 *array = aijkok->a_dual.view_host().data(); 163 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 164 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 165 } 166 PetscFunctionReturn(PETSC_SUCCESS); 167 } 168 169 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 170 { 171 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 172 173 PetscFunctionBegin; 174 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 175 aijkok->a_dual.clear_sync_state(); 176 aijkok->a_dual.modify_host(); 177 } 178 PetscFunctionReturn(PETSC_SUCCESS); 179 } 180 181 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 182 { 183 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184 185 PetscFunctionBegin; 186 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 187 188 if (i) *i = aijkok->i_device_data(); 189 if (j) *j = aijkok->j_device_data(); 190 if (a) { 191 aijkok->a_dual.sync_device(); 192 *a = aijkok->a_device_data(); 193 } 194 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 195 PetscFunctionReturn(PETSC_SUCCESS); 196 } 197 198 /* 199 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 200 201 Input Parameter: 202 . A - the MATSEQAIJKOKKOS matrix 203 204 Output Parameters: 205 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 206 - T_d - the transpose on device, whose value array is allocated but not initialized 207 */ 208 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 209 { 210 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 211 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 212 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 213 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 214 MatRowMapType *Ti = Ti_h.data(); 215 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 216 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 217 PetscInt *Tj = Tj_h.data(); 218 PetscInt *perm = perm_h.data(); 219 PetscInt *offset; 220 221 PetscFunctionBegin; 222 // Populate Ti 223 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 224 Ti++; 225 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 226 Ti--; 227 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 228 229 // Populate Tj and the permutation array 230 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 231 for (PetscInt i = 0; i < m; i++) { 232 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 233 PetscInt r = Aj[j]; // row r of T 234 PetscInt disp = Ti[r] + offset[r]; 235 236 Tj[disp] = i; // col i of T 237 perm[disp] = j; 238 offset[r]++; 239 } 240 } 241 PetscCall(PetscFree(offset)); 242 243 // Sort each row of T, along with the permutation array 244 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 245 246 // Output perm and T on device 247 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 248 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 249 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 250 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 251 PetscFunctionReturn(PETSC_SUCCESS); 252 } 253 254 // Generate the transpose on device and cache it internally 255 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 256 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 257 { 258 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 259 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 260 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 261 KokkosCsrMatrix &T = akok->csrmatT; 262 263 PetscFunctionBegin; 264 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 265 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 266 267 const auto &Aa = akok->a_dual.view_device(); 268 269 if (A->symmetric == PETSC_BOOL3_TRUE) { 270 *csrmatT = akok->csrmat; 271 } else { 272 // See if we already have a cached transpose and its value is up to date 273 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 274 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 275 const auto &perm = akok->transpose_perm; // get the permutation array 276 auto &Ta = T.values; 277 278 PetscCallCXX(Kokkos::parallel_for( 279 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 280 } 281 } else { // Generate T of size n x m for the first time 282 MatRowMapKokkosView perm; 283 284 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 285 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 286 PetscCallCXX(Kokkos::parallel_for( 287 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 288 } 289 akok->transpose_updated = PETSC_TRUE; 290 *csrmatT = akok->csrmatT; 291 } 292 PetscFunctionReturn(PETSC_SUCCESS); 293 } 294 295 // Generate the Hermitian on device and cache it internally 296 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 297 { 298 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 299 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 300 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 301 KokkosCsrMatrix &T = akok->csrmatH; 302 303 PetscFunctionBegin; 304 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 305 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 306 307 const auto &Aa = akok->a_dual.view_device(); 308 309 if (A->hermitian == PETSC_BOOL3_TRUE) { 310 *csrmatH = akok->csrmat; 311 } else { 312 // See if we already have a cached hermitian and its value is up to date 313 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 314 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 315 const auto &perm = akok->transpose_perm; // get the permutation array 316 auto &Ta = T.values; 317 318 PetscCallCXX(Kokkos::parallel_for( 319 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 320 } 321 } else { // Generate T of size n x m for the first time 322 MatRowMapKokkosView perm; 323 324 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 325 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 326 PetscCallCXX(Kokkos::parallel_for( 327 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 328 } 329 akok->hermitian_updated = PETSC_TRUE; 330 *csrmatH = akok->csrmatH; 331 } 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 335 /* y = A x */ 336 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 337 { 338 Mat_SeqAIJKokkos *aijkok; 339 ConstPetscScalarKokkosView xv; 340 PetscScalarKokkosView yv; 341 342 PetscFunctionBegin; 343 PetscCall(PetscLogGpuTimeBegin()); 344 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 345 PetscCall(VecGetKokkosView(xx, &xv)); 346 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 347 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 348 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 349 PetscCall(VecRestoreKokkosView(xx, &xv)); 350 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 351 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 352 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 353 PetscCall(PetscLogGpuTimeEnd()); 354 PetscFunctionReturn(PETSC_SUCCESS); 355 } 356 357 /* y = A^T x */ 358 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 359 { 360 Mat_SeqAIJKokkos *aijkok; 361 const char *mode; 362 ConstPetscScalarKokkosView xv; 363 PetscScalarKokkosView yv; 364 KokkosCsrMatrix csrmat; 365 366 PetscFunctionBegin; 367 PetscCall(PetscLogGpuTimeBegin()); 368 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 369 PetscCall(VecGetKokkosView(xx, &xv)); 370 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 371 if (A->form_explicit_transpose) { 372 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 373 mode = "N"; 374 } else { 375 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 376 csrmat = aijkok->csrmat; 377 mode = "T"; 378 } 379 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 380 PetscCall(VecRestoreKokkosView(xx, &xv)); 381 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 382 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 383 PetscCall(PetscLogGpuTimeEnd()); 384 PetscFunctionReturn(PETSC_SUCCESS); 385 } 386 387 /* y = A^H x */ 388 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 389 { 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv; 393 PetscScalarKokkosView yv; 394 KokkosCsrMatrix csrmat; 395 396 PetscFunctionBegin; 397 PetscCall(PetscLogGpuTimeBegin()); 398 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 399 PetscCall(VecGetKokkosView(xx, &xv)); 400 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 401 if (A->form_explicit_transpose) { 402 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 403 mode = "N"; 404 } else { 405 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 406 csrmat = aijkok->csrmat; 407 mode = "C"; 408 } 409 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 410 PetscCall(VecRestoreKokkosView(xx, &xv)); 411 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 412 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 413 PetscCall(PetscLogGpuTimeEnd()); 414 PetscFunctionReturn(PETSC_SUCCESS); 415 } 416 417 /* z = A x + y */ 418 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 419 { 420 Mat_SeqAIJKokkos *aijkok; 421 ConstPetscScalarKokkosView xv, yv; 422 PetscScalarKokkosView zv; 423 424 PetscFunctionBegin; 425 PetscCall(PetscLogGpuTimeBegin()); 426 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 427 PetscCall(VecGetKokkosView(xx, &xv)); 428 PetscCall(VecGetKokkosView(yy, &yv)); 429 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 430 if (zz != yy) Kokkos::deep_copy(zv, yv); 431 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 432 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 433 PetscCall(VecRestoreKokkosView(xx, &xv)); 434 PetscCall(VecRestoreKokkosView(yy, &yv)); 435 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 436 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 437 PetscCall(PetscLogGpuTimeEnd()); 438 PetscFunctionReturn(PETSC_SUCCESS); 439 } 440 441 /* z = A^T x + y */ 442 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 443 { 444 Mat_SeqAIJKokkos *aijkok; 445 const char *mode; 446 ConstPetscScalarKokkosView xv, yv; 447 PetscScalarKokkosView zv; 448 KokkosCsrMatrix csrmat; 449 450 PetscFunctionBegin; 451 PetscCall(PetscLogGpuTimeBegin()); 452 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 453 PetscCall(VecGetKokkosView(xx, &xv)); 454 PetscCall(VecGetKokkosView(yy, &yv)); 455 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 456 if (zz != yy) Kokkos::deep_copy(zv, yv); 457 if (A->form_explicit_transpose) { 458 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 459 mode = "N"; 460 } else { 461 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 462 csrmat = aijkok->csrmat; 463 mode = "T"; 464 } 465 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 466 PetscCall(VecRestoreKokkosView(xx, &xv)); 467 PetscCall(VecRestoreKokkosView(yy, &yv)); 468 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 469 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 470 PetscCall(PetscLogGpuTimeEnd()); 471 PetscFunctionReturn(PETSC_SUCCESS); 472 } 473 474 /* z = A^H x + y */ 475 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 476 { 477 Mat_SeqAIJKokkos *aijkok; 478 const char *mode; 479 ConstPetscScalarKokkosView xv, yv; 480 PetscScalarKokkosView zv; 481 KokkosCsrMatrix csrmat; 482 483 PetscFunctionBegin; 484 PetscCall(PetscLogGpuTimeBegin()); 485 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 486 PetscCall(VecGetKokkosView(xx, &xv)); 487 PetscCall(VecGetKokkosView(yy, &yv)); 488 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 489 if (zz != yy) Kokkos::deep_copy(zv, yv); 490 if (A->form_explicit_transpose) { 491 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 492 mode = "N"; 493 } else { 494 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 495 csrmat = aijkok->csrmat; 496 mode = "C"; 497 } 498 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 499 PetscCall(VecRestoreKokkosView(xx, &xv)); 500 PetscCall(VecRestoreKokkosView(yy, &yv)); 501 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 502 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 503 PetscCall(PetscLogGpuTimeEnd()); 504 PetscFunctionReturn(PETSC_SUCCESS); 505 } 506 507 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508 { 509 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510 511 PetscFunctionBegin; 512 switch (op) { 513 case MAT_FORM_EXPLICIT_TRANSPOSE: 514 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 515 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516 A->form_explicit_transpose = flg; 517 break; 518 default: 519 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520 break; 521 } 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 /* Depending on reuse, either build a new mat, or use the existing mat */ 526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527 { 528 Mat_SeqAIJ *aseq; 529 530 PetscFunctionBegin; 531 PetscCall(PetscKokkosInitializeCheck()); 532 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 533 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 534 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 535 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 537 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 538 PetscCall(PetscFree(A->defaultvectype)); 539 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 540 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 541 PetscCall(MatSetOps_SeqAIJKokkos(A)); 542 aseq = static_cast<Mat_SeqAIJ *>(A->data); 543 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 544 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 546 } 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553 */ 554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555 { 556 Mat_SeqAIJ *bseq; 557 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558 Mat mat; 559 560 PetscFunctionBegin; 561 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 562 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563 mat = *B; 564 if (A->preallocated) { 565 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 567 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568 /* Now copy values to B if needed */ 569 if (dupOption == MAT_COPY_VALUES) { 570 if (akok->a_dual.need_sync_device()) { 571 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572 bkok->a_dual.modify_host(); 573 } else { /* If device has the latest data, we only copy data on device */ 574 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575 bkok->a_dual.modify_device(); 576 } 577 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579 bkok->a_dual.modify_host(); 580 } 581 mat->spptr = bkok; 582 } 583 584 PetscCall(PetscFree(mat->defaultvectype)); 585 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 586 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 587 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592 { 593 Mat At; 594 KokkosCsrMatrix internT; 595 Mat_SeqAIJKokkos *atkok, *bkok; 596 597 PetscFunctionBegin; 598 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 599 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 600 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601 /* Deep copy internT, as we want to isolate the internal transpose */ 602 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 603 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 604 if (reuse == MAT_INITIAL_MATRIX) *B = At; 605 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 606 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 607 if ((*B)->assembled) { 608 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 609 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 610 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 611 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 612 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 613 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 614 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 615 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 616 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 617 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 618 } 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623 { 624 Mat_SeqAIJKokkos *aijkok; 625 626 PetscFunctionBegin; 627 if (A->factortype == MAT_FACTOR_NONE) { 628 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 629 delete aijkok; 630 } else { 631 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 632 } 633 A->spptr = NULL; 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 635 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 636 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 637 PetscCall(MatDestroy_SeqAIJ(A)); 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 /*MC 642 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 643 644 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 645 646 Options Database Key: 647 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 648 649 Level: beginner 650 651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 652 M*/ 653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 654 { 655 PetscFunctionBegin; 656 PetscCall(PetscKokkosInitializeCheck()); 657 PetscCall(MatCreate_SeqAIJ(A)); 658 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 659 PetscFunctionReturn(PETSC_SUCCESS); 660 } 661 662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 664 { 665 Mat_SeqAIJ *a, *b; 666 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667 MatScalarKokkosView aa, ba, ca; 668 MatRowMapKokkosView ai, bi, ci; 669 MatColIdxKokkosView aj, bj, cj; 670 PetscInt m, n, nnz, aN; 671 672 PetscFunctionBegin; 673 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 674 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 675 PetscAssertPointer(C, 4); 676 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 677 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 678 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 679 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 680 681 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 682 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 683 a = static_cast<Mat_SeqAIJ *>(A->data); 684 b = static_cast<Mat_SeqAIJ *>(B->data); 685 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 686 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 687 aa = akok->a_dual.view_device(); 688 ai = akok->i_dual.view_device(); 689 ba = bkok->a_dual.view_device(); 690 bi = bkok->i_dual.view_device(); 691 m = A->rmap->n; /* M, N and nnz of C */ 692 n = A->cmap->n + B->cmap->n; 693 nnz = a->nz + b->nz; 694 aN = A->cmap->n; /* N of A */ 695 if (reuse == MAT_INITIAL_MATRIX) { 696 aj = akok->j_dual.view_device(); 697 bj = bkok->j_dual.view_device(); 698 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 699 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 700 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 701 ca = ca_dual.view_device(); 702 ci = ci_dual.view_device(); 703 cj = cj_dual.view_device(); 704 705 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 706 Kokkos::parallel_for( 707 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 708 PetscInt i = t.league_rank(); /* row i */ 709 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 710 711 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 712 ci(i) = coffset; 713 if (i == m - 1) ci(m) = ai(m) + bi(m); 714 }); 715 716 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 717 if (k < alen) { 718 ca(coffset + k) = aa(ai(i) + k); 719 cj(coffset + k) = aj(ai(i) + k); 720 } else { 721 ca(coffset + k) = ba(bi(i) + k - alen); 722 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 723 } 724 }); 725 }); 726 ca_dual.modify_device(); 727 ci_dual.modify_device(); 728 cj_dual.modify_device(); 729 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 730 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 731 } else if (reuse == MAT_REUSE_MATRIX) { 732 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 733 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 734 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 735 ca = ckok->a_dual.view_device(); 736 ci = ckok->i_dual.view_device(); 737 738 Kokkos::parallel_for( 739 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 740 PetscInt i = t.league_rank(); /* row i */ 741 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 742 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 743 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 744 else ca(ci(i) + k) = ba(bi(i) + k - alen); 745 }); 746 }); 747 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 748 } 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 753 { 754 PetscFunctionBegin; 755 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 760 { 761 Mat_Product *product = C->product; 762 Mat A, B; 763 bool transA, transB; /* use bool, since KK needs this type */ 764 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 765 Mat_SeqAIJ *c; 766 MatProductData_SeqAIJKokkos *pdata; 767 KokkosCsrMatrix csrmatA, csrmatB; 768 769 PetscFunctionBegin; 770 MatCheckProduct(C, 1); 771 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 772 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 773 774 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 775 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 776 // we still do numeric. 777 if (pdata->reusesym) { // numeric reuses results from symbolic 778 pdata->reusesym = PETSC_FALSE; 779 PetscFunctionReturn(PETSC_SUCCESS); 780 } 781 782 switch (product->type) { 783 case MATPRODUCT_AB: 784 transA = false; 785 transB = false; 786 break; 787 case MATPRODUCT_AtB: 788 transA = true; 789 transB = false; 790 break; 791 case MATPRODUCT_ABt: 792 transA = false; 793 transB = true; 794 break; 795 default: 796 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 797 } 798 799 A = product->A; 800 B = product->B; 801 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 802 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 803 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 804 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 805 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 806 807 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 808 809 csrmatA = akok->csrmat; 810 csrmatB = bkok->csrmat; 811 812 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 813 if (transA) { 814 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 815 transA = false; 816 } 817 818 if (transB) { 819 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 820 transB = false; 821 } 822 PetscCall(PetscLogGpuTimeBegin()); 823 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 825 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 826 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 827 #endif 828 829 PetscCall(PetscLogGpuTimeEnd()); 830 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 831 /* shorter version of MatAssemblyEnd_SeqAIJ */ 832 c = (Mat_SeqAIJ *)C->data; 833 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 834 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 835 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 836 c->reallocs = 0; 837 C->info.mallocs = 0; 838 C->info.nz_unneeded = 0; 839 C->assembled = C->was_assembled = PETSC_TRUE; 840 C->num_ass++; 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 845 { 846 Mat_Product *product = C->product; 847 MatProductType ptype; 848 Mat A, B; 849 bool transA, transB; 850 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 851 MatProductData_SeqAIJKokkos *pdata; 852 MPI_Comm comm; 853 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 854 855 PetscFunctionBegin; 856 MatCheckProduct(C, 1); 857 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 858 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 859 A = product->A; 860 B = product->B; 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 863 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 864 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 865 csrmatA = akok->csrmat; 866 csrmatB = bkok->csrmat; 867 868 ptype = product->type; 869 // Take advantage of the symmetry if true 870 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 871 ptype = MATPRODUCT_AB; 872 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 873 } 874 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 875 ptype = MATPRODUCT_AB; 876 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 877 } 878 879 switch (ptype) { 880 case MATPRODUCT_AB: 881 transA = false; 882 transB = false; 883 break; 884 case MATPRODUCT_AtB: 885 transA = true; 886 transB = false; 887 break; 888 case MATPRODUCT_ABt: 889 transA = false; 890 transB = true; 891 break; 892 default: 893 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 894 } 895 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 896 pdata->reusesym = product->api_user; 897 898 /* TODO: add command line options to select spgemm algorithms */ 899 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 900 901 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 903 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 904 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 905 #endif 906 #endif 907 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 908 909 PetscCall(PetscLogGpuTimeBegin()); 910 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 911 if (transA) { 912 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 913 transA = false; 914 } 915 916 if (transB) { 917 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 918 transB = false; 919 } 920 921 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 922 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 923 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 924 calling new Mat_SeqAIJKokkos(). 925 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 926 */ 927 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 929 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 930 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 931 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 932 #endif 933 PetscCall(PetscLogGpuTimeEnd()); 934 935 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 936 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 937 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 938 PetscFunctionReturn(PETSC_SUCCESS); 939 } 940 941 /* handles sparse matrix matrix ops */ 942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 943 { 944 Mat_Product *product = mat->product; 945 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 946 947 PetscFunctionBegin; 948 MatCheckProduct(mat, 1); 949 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 950 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 951 if (Biskok && Ciskok) { 952 switch (product->type) { 953 case MATPRODUCT_AB: 954 case MATPRODUCT_AtB: 955 case MATPRODUCT_ABt: 956 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 957 break; 958 case MATPRODUCT_PtAP: 959 case MATPRODUCT_RARt: 960 case MATPRODUCT_ABC: 961 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 962 break; 963 default: 964 break; 965 } 966 } else { /* fallback for AIJ */ 967 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 968 } 969 PetscFunctionReturn(PETSC_SUCCESS); 970 } 971 972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 973 { 974 Mat_SeqAIJKokkos *aijkok; 975 976 PetscFunctionBegin; 977 PetscCall(PetscLogGpuTimeBegin()); 978 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 979 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 980 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 981 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 982 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 983 PetscCall(PetscLogGpuTimeEnd()); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 988 { 989 Mat_SeqAIJKokkos *aijkok; 990 991 PetscFunctionBegin; 992 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 993 if (aijkok) { /* Only zero the device if data is already there */ 994 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 995 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 996 } else { /* Might be preallocated but not assembled */ 997 PetscCall(MatZeroEntries_SeqAIJ(A)); 998 } 999 PetscFunctionReturn(PETSC_SUCCESS); 1000 } 1001 1002 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1003 { 1004 Mat_SeqAIJ *aijseq; 1005 Mat_SeqAIJKokkos *aijkok; 1006 PetscInt n; 1007 PetscScalarKokkosView xv; 1008 1009 PetscFunctionBegin; 1010 PetscCall(VecGetLocalSize(x, &n)); 1011 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1012 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1013 1014 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1015 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1016 1017 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1018 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1019 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1020 aijkok->SetDiagonal(aijseq->diag); 1021 } 1022 1023 const auto &Aa = aijkok->a_dual.view_device(); 1024 const auto &Ai = aijkok->i_dual.view_device(); 1025 const auto &Adiag = aijkok->diag_dual.view_device(); 1026 1027 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1028 Kokkos::parallel_for( 1029 n, KOKKOS_LAMBDA(const PetscInt i) { 1030 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1031 else xv(i) = 0; 1032 }); 1033 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1034 PetscFunctionReturn(PETSC_SUCCESS); 1035 } 1036 1037 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1038 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1039 { 1040 Mat_SeqAIJKokkos *aijkok; 1041 1042 PetscFunctionBegin; 1043 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1044 PetscAssertPointer(kv, 2); 1045 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1047 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1048 *kv = aijkok->a_dual.view_device(); 1049 PetscFunctionReturn(PETSC_SUCCESS); 1050 } 1051 1052 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1053 { 1054 PetscFunctionBegin; 1055 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1056 PetscAssertPointer(kv, 2); 1057 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1058 PetscFunctionReturn(PETSC_SUCCESS); 1059 } 1060 1061 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1062 { 1063 Mat_SeqAIJKokkos *aijkok; 1064 1065 PetscFunctionBegin; 1066 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1067 PetscAssertPointer(kv, 2); 1068 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1069 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1070 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1071 *kv = aijkok->a_dual.view_device(); 1072 PetscFunctionReturn(PETSC_SUCCESS); 1073 } 1074 1075 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1076 { 1077 PetscFunctionBegin; 1078 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1079 PetscAssertPointer(kv, 2); 1080 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1081 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1082 PetscFunctionReturn(PETSC_SUCCESS); 1083 } 1084 1085 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1086 { 1087 Mat_SeqAIJKokkos *aijkok; 1088 1089 PetscFunctionBegin; 1090 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1091 PetscAssertPointer(kv, 2); 1092 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1093 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1094 *kv = aijkok->a_dual.view_device(); 1095 PetscFunctionReturn(PETSC_SUCCESS); 1096 } 1097 1098 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1099 { 1100 PetscFunctionBegin; 1101 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1102 PetscAssertPointer(kv, 2); 1103 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1104 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1105 PetscFunctionReturn(PETSC_SUCCESS); 1106 } 1107 1108 /* Computes Y += alpha X */ 1109 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1110 { 1111 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1112 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1113 ConstMatScalarKokkosView Xa; 1114 MatScalarKokkosView Ya; 1115 1116 PetscFunctionBegin; 1117 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1118 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1119 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1120 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1121 PetscCall(PetscLogGpuTimeBegin()); 1122 1123 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1124 PetscBool e; 1125 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1126 if (e) { 1127 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1128 if (e) pattern = SAME_NONZERO_PATTERN; 1129 } 1130 } 1131 1132 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1133 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1134 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1135 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1136 */ 1137 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1138 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1139 Xa = xkok->a_dual.view_device(); 1140 Ya = ykok->a_dual.view_device(); 1141 1142 if (pattern == SAME_NONZERO_PATTERN) { 1143 KokkosBlas::axpy(alpha, Xa, Ya); 1144 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1145 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1146 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1147 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1148 1149 Kokkos::parallel_for( 1150 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1151 PetscInt i = t.league_rank(); // row i 1152 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1153 // Only one thread works in a team 1154 PetscInt p, q = Yi(i); 1155 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1156 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1157 if (Xj(p) == Yj(q)) { // Found it 1158 Ya(q) += alpha * Xa(p); 1159 q++; 1160 } else { 1161 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1162 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1163 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1164 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1165 #else 1166 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1167 #endif 1168 } 1169 } 1170 }); 1171 }); 1172 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1173 } else { // different nonzero patterns 1174 Mat Z; 1175 KokkosCsrMatrix zcsr; 1176 KernelHandle kh; 1177 kh.create_spadd_handle(true); // X, Y are sorted 1178 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1179 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1180 zkok = new Mat_SeqAIJKokkos(zcsr); 1181 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1182 PetscCall(MatHeaderReplace(Y, &Z)); 1183 kh.destroy_spadd_handle(); 1184 } 1185 PetscCall(PetscLogGpuTimeEnd()); 1186 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 struct MatCOOStruct_SeqAIJKokkos { 1191 PetscCount n; 1192 PetscCount Atot; 1193 PetscInt nz; 1194 PetscCountKokkosView jmap; 1195 PetscCountKokkosView perm; 1196 1197 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1198 { 1199 nz = coo_h->nz; 1200 n = coo_h->n; 1201 Atot = coo_h->Atot; 1202 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1203 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1204 } 1205 }; 1206 1207 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1208 { 1209 PetscFunctionBegin; 1210 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1215 { 1216 Mat_SeqAIJKokkos *akok; 1217 Mat_SeqAIJ *aseq; 1218 PetscContainer container_h, container_d; 1219 MatCOOStruct_SeqAIJ *coo_h; 1220 MatCOOStruct_SeqAIJKokkos *coo_d; 1221 1222 PetscFunctionBegin; 1223 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1224 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1225 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1226 delete akok; 1227 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1228 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1229 1230 // Copy the COO struct to device 1231 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1232 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1233 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1234 1235 // Put the COO struct in a container and then attach that to the matrix 1236 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1237 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1238 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1239 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1240 PetscCall(PetscContainerDestroy(&container_d)); 1241 PetscFunctionReturn(PETSC_SUCCESS); 1242 } 1243 1244 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1245 { 1246 MatScalarKokkosView Aa; 1247 ConstMatScalarKokkosView kv; 1248 PetscMemType memtype; 1249 PetscContainer container; 1250 MatCOOStruct_SeqAIJKokkos *coo; 1251 1252 PetscFunctionBegin; 1253 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1254 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1255 1256 const auto &n = coo->n; 1257 const auto &Annz = coo->nz; 1258 const auto &jmap = coo->jmap; 1259 const auto &perm = coo->perm; 1260 1261 PetscCall(PetscGetMemType(v, &memtype)); 1262 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1263 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1264 } else { 1265 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1266 } 1267 1268 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1269 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1270 1271 PetscCall(PetscLogGpuTimeBegin()); 1272 Kokkos::parallel_for( 1273 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1274 PetscScalar sum = 0.0; 1275 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1276 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1277 }); 1278 PetscCall(PetscLogGpuTimeEnd()); 1279 1280 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1281 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1282 PetscFunctionReturn(PETSC_SUCCESS); 1283 } 1284 1285 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1286 { 1287 PetscFunctionBegin; 1288 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1289 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1290 B->offloadmask = PETSC_OFFLOAD_CPU; 1291 PetscFunctionReturn(PETSC_SUCCESS); 1292 } 1293 1294 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1295 { 1296 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1297 1298 PetscFunctionBegin; 1299 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1300 A->boundtocpu = PETSC_FALSE; 1301 1302 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1303 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1304 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1305 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1306 A->ops->scale = MatScale_SeqAIJKokkos; 1307 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1308 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1309 A->ops->mult = MatMult_SeqAIJKokkos; 1310 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1311 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1312 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1313 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1314 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1315 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1316 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1317 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1318 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1319 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1320 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1321 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1322 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1323 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1324 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1325 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1326 1327 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1328 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1329 PetscFunctionReturn(PETSC_SUCCESS); 1330 } 1331 1332 /* 1333 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1334 1335 Input Parameters: 1336 + A - the MATSEQAIJKOKKOS matrix 1337 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1338 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1339 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1340 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1341 1342 Output Parameter: 1343 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1344 */ 1345 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1346 { 1347 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1348 PetscInt N = A->rmap->n; 1349 PetscInt nblocks = bs.extent(0) - 1; 1350 1351 PetscFunctionBegin; 1352 // Set the diagonal pointer on device if not already 1353 if (N && akok->diag_dual.extent(0) == 0) { 1354 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1355 akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag); 1356 } 1357 1358 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1359 1360 // Pull out the diagonal blocks of the matrix and then invert the blocks 1361 auto Aa = akok->a_dual.view_device(); 1362 auto Ai = akok->i_dual.view_device(); 1363 auto Aj = akok->j_dual.view_device(); 1364 auto Adiag = akok->diag_dual.view_device(); 1365 // TODO: how to tune the team size? 1366 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1367 auto ts = Kokkos::AUTO(); 1368 #else 1369 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1370 #endif 1371 PetscCallCXX(Kokkos::parallel_for( 1372 Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1373 const PetscInt bid = teamMember.league_rank(); // block id 1374 const PetscInt rstart = bs(bid); // this block starts from this row 1375 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1376 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1377 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1378 1379 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1380 PetscInt i = rstart + r; // i-th row in A 1381 1382 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1383 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1384 1385 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1386 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1387 B(r, c) = 0.0; 1388 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1389 B(r, c) = Aa(first + c); 1390 } else { // this entry does not show up in the CSR 1391 B(r, c) = 0.0; 1392 } 1393 } 1394 } else { // rare case that the diagonal does not exist 1395 const PetscInt begin = Ai(i); 1396 const PetscInt end = Ai(i + 1); 1397 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1398 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1399 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1400 else if (Aj(j) >= rstart + m) break; 1401 } 1402 } 1403 }); 1404 1405 // LU-decompose B (w/o pivoting) and then invert B 1406 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1407 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1408 })); 1409 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1410 PetscFunctionReturn(PETSC_SUCCESS); 1411 } 1412 1413 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1414 { 1415 Mat_SeqAIJ *aseq; 1416 PetscInt i, m, n; 1417 1418 PetscFunctionBegin; 1419 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1420 1421 m = akok->nrows(); 1422 n = akok->ncols(); 1423 PetscCall(MatSetSizes(A, m, n, m, n)); 1424 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1425 1426 /* Set up data structures of A as a MATSEQAIJ */ 1427 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1428 aseq = (Mat_SeqAIJ *)(A)->data; 1429 1430 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1431 akok->j_dual.sync_host(); 1432 1433 aseq->i = akok->i_host_data(); 1434 aseq->j = akok->j_host_data(); 1435 aseq->a = akok->a_host_data(); 1436 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1437 aseq->singlemalloc = PETSC_FALSE; 1438 aseq->free_a = PETSC_FALSE; 1439 aseq->free_ij = PETSC_FALSE; 1440 aseq->nz = akok->nnz(); 1441 aseq->maxnz = aseq->nz; 1442 1443 PetscCall(PetscMalloc1(m, &aseq->imax)); 1444 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1445 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1446 1447 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1448 akok->nonzerostate = A->nonzerostate; 1449 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1450 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1451 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1452 PetscFunctionReturn(PETSC_SUCCESS); 1453 } 1454 1455 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1456 { 1457 PetscFunctionBegin; 1458 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1459 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1460 PetscFunctionReturn(PETSC_SUCCESS); 1461 } 1462 1463 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1464 { 1465 Mat_SeqAIJKokkos *akok; 1466 PetscFunctionBegin; 1467 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1468 PetscCall(MatCreate(comm, A)); 1469 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1470 PetscFunctionReturn(PETSC_SUCCESS); 1471 } 1472 1473 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1474 1475 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1476 */ 1477 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1478 { 1479 PetscFunctionBegin; 1480 PetscCall(MatCreate(comm, A)); 1481 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1482 PetscFunctionReturn(PETSC_SUCCESS); 1483 } 1484 1485 /*@C 1486 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1487 (the default parallel PETSc format). This matrix will ultimately be handled by 1488 Kokkos for calculations. 1489 1490 Collective 1491 1492 Input Parameters: 1493 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1494 . m - number of rows 1495 . n - number of columns 1496 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1497 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1498 1499 Output Parameter: 1500 . A - the matrix 1501 1502 Level: intermediate 1503 1504 Notes: 1505 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1506 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1507 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1508 1509 The AIJ format, also called 1510 compressed row storage, is fully compatible with standard Fortran 1511 storage. That is, the stored row and column indices can begin at 1512 either one (as in Fortran) or zero. 1513 1514 Specify the preallocated storage with either `nz` or `nnz` (not both). 1515 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1516 allocation. 1517 1518 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1519 @*/ 1520 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1521 { 1522 PetscFunctionBegin; 1523 PetscCall(PetscKokkosInitializeCheck()); 1524 PetscCall(MatCreate(comm, A)); 1525 PetscCall(MatSetSizes(*A, m, n, m, n)); 1526 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1527 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1528 PetscFunctionReturn(PETSC_SUCCESS); 1529 } 1530 1531 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1532 { 1533 PetscFunctionBegin; 1534 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1535 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1536 PetscFunctionReturn(PETSC_SUCCESS); 1537 } 1538 1539 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1540 { 1541 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1542 1543 PetscFunctionBegin; 1544 if (!factors->sptrsv_symbolic_completed) { 1545 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1546 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1547 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1548 } 1549 PetscFunctionReturn(PETSC_SUCCESS); 1550 } 1551 1552 /* Check if we need to update factors etc for transpose solve */ 1553 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1554 { 1555 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1556 MatColIdxType n = A->rmap->n; 1557 1558 PetscFunctionBegin; 1559 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1560 /* Update L^T and do sptrsv symbolic */ 1561 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1562 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1563 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1564 1565 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1566 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1567 1568 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1569 We have to sort the indices, until KK provides finer control options. 1570 */ 1571 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1572 1573 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1574 1575 /* Update U^T and do sptrsv symbolic */ 1576 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1577 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1578 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1579 1580 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1581 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1582 1583 /* Sort indices. See comments above */ 1584 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1585 1586 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1587 factors->transpose_updated = PETSC_TRUE; 1588 } 1589 PetscFunctionReturn(PETSC_SUCCESS); 1590 } 1591 1592 /* Solve Ax = b, with A = LU */ 1593 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1594 { 1595 ConstPetscScalarKokkosView bv; 1596 PetscScalarKokkosView xv; 1597 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1598 1599 PetscFunctionBegin; 1600 PetscCall(PetscLogGpuTimeBegin()); 1601 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1602 PetscCall(VecGetKokkosView(b, &bv)); 1603 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1604 /* Solve L tmpv = b */ 1605 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1606 /* Solve Ux = tmpv */ 1607 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1608 PetscCall(VecRestoreKokkosView(b, &bv)); 1609 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1610 PetscCall(PetscLogGpuTimeEnd()); 1611 PetscFunctionReturn(PETSC_SUCCESS); 1612 } 1613 1614 /* Solve A^T x = b, where A^T = U^T L^T */ 1615 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1616 { 1617 ConstPetscScalarKokkosView bv; 1618 PetscScalarKokkosView xv; 1619 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1620 1621 PetscFunctionBegin; 1622 PetscCall(PetscLogGpuTimeBegin()); 1623 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1624 PetscCall(VecGetKokkosView(b, &bv)); 1625 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1626 /* Solve U^T tmpv = b */ 1627 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1628 1629 /* Solve L^T x = tmpv */ 1630 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1631 PetscCall(VecRestoreKokkosView(b, &bv)); 1632 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1633 PetscCall(PetscLogGpuTimeEnd()); 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636 1637 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1638 { 1639 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1640 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1641 PetscInt fill_lev = info->levels; 1642 1643 PetscFunctionBegin; 1644 PetscCall(PetscLogGpuTimeBegin()); 1645 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1646 1647 auto a_d = aijkok->a_dual.view_device(); 1648 auto i_d = aijkok->i_dual.view_device(); 1649 auto j_d = aijkok->j_dual.view_device(); 1650 1651 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1652 1653 B->assembled = PETSC_TRUE; 1654 B->preallocated = PETSC_TRUE; 1655 B->ops->solve = MatSolve_SeqAIJKokkos; 1656 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1657 B->ops->matsolve = NULL; 1658 B->ops->matsolvetranspose = NULL; 1659 B->offloadmask = PETSC_OFFLOAD_GPU; 1660 1661 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1662 factors->transpose_updated = PETSC_FALSE; 1663 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1664 /* TODO: log flops, but how to know that? */ 1665 PetscCall(PetscLogGpuTimeEnd()); 1666 PetscFunctionReturn(PETSC_SUCCESS); 1667 } 1668 1669 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1670 { 1671 Mat_SeqAIJKokkos *aijkok; 1672 Mat_SeqAIJ *b; 1673 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1674 PetscInt fill_lev = info->levels; 1675 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1676 PetscInt n = A->rmap->n; 1677 1678 PetscFunctionBegin; 1679 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1680 /* Rebuild factors */ 1681 if (factors) { 1682 factors->Destroy(); 1683 } /* Destroy the old if it exists */ 1684 else { 1685 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1686 } 1687 1688 /* Create a spiluk handle and then do symbolic factorization */ 1689 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1690 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1691 1692 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1693 1694 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1695 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1696 Kokkos::realloc(factors->iU_d, n + 1); 1697 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1698 1699 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1700 auto i_d = aijkok->i_dual.view_device(); 1701 auto j_d = aijkok->j_dual.view_device(); 1702 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1703 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1704 1705 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1706 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1707 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1708 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1709 1710 /* TODO: add options to select sptrsv algorithms */ 1711 /* Create sptrsv handles for L, U and their transpose */ 1712 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1713 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1714 #else 1715 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1716 #endif 1717 1718 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1719 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1720 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1721 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1722 1723 /* Fill fields of the factor matrix B */ 1724 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1725 b = (Mat_SeqAIJ *)B->data; 1726 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1727 B->info.fill_ratio_given = info->fill; 1728 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1729 1730 B->offloadmask = PETSC_OFFLOAD_GPU; 1731 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1732 PetscFunctionReturn(PETSC_SUCCESS); 1733 } 1734 1735 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1736 { 1737 PetscFunctionBegin; 1738 *type = MATSOLVERKOKKOS; 1739 PetscFunctionReturn(PETSC_SUCCESS); 1740 } 1741 1742 /*MC 1743 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1744 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1745 1746 Level: beginner 1747 1748 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1749 M*/ 1750 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1751 { 1752 PetscInt n = A->rmap->n; 1753 1754 PetscFunctionBegin; 1755 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1756 PetscCall(MatSetSizes(*B, n, n, n, n)); 1757 (*B)->factortype = ftype; 1758 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1759 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1760 1761 if (ftype == MAT_FACTOR_LU) { 1762 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1763 (*B)->canuseordering = PETSC_TRUE; 1764 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1765 } else if (ftype == MAT_FACTOR_ILU) { 1766 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1767 (*B)->canuseordering = PETSC_FALSE; 1768 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1769 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1770 1771 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1772 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1773 PetscFunctionReturn(PETSC_SUCCESS); 1774 } 1775 1776 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1777 { 1778 PetscFunctionBegin; 1779 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1780 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1781 PetscFunctionReturn(PETSC_SUCCESS); 1782 } 1783 1784 /* Utility to print out a KokkosCsrMatrix for debugging */ 1785 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1786 { 1787 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1788 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1789 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1790 const PetscInt *i = iv.data(); 1791 const PetscInt *j = jv.data(); 1792 const PetscScalar *a = av.data(); 1793 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1794 1795 PetscFunctionBegin; 1796 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1797 for (PetscInt k = 0; k < m; k++) { 1798 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1799 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1800 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1801 } 1802 PetscFunctionReturn(PETSC_SUCCESS); 1803 } 1804