1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } 59 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 auto &exec = PetscGetKokkosExecutionSpace(); 99 100 PetscFunctionBegin; 101 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 102 /* We do not expect one needs factors on host */ 103 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 104 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 105 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 106 PetscCallCXX(exec.fence()); 107 PetscFunctionReturn(PETSC_SUCCESS); 108 } 109 110 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 111 { 112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 113 114 PetscFunctionBegin; 115 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 116 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 117 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 118 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 119 */ 120 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 121 auto &exec = PetscGetKokkosExecutionSpace(); 122 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 123 PetscCallCXX(exec.fence()); 124 *array = aijkok->a_dual.view_host().data(); 125 } else { /* Happens when calling MatSetValues on a newly created matrix */ 126 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 127 } 128 PetscFunctionReturn(PETSC_SUCCESS); 129 } 130 131 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 132 { 133 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 134 135 PetscFunctionBegin; 136 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 141 { 142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 143 144 PetscFunctionBegin; 145 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 146 auto &exec = PetscGetKokkosExecutionSpace(); 147 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 148 PetscCallCXX(exec.fence()); 149 *array = aijkok->a_dual.view_host().data(); 150 } else { 151 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 152 } 153 PetscFunctionReturn(PETSC_SUCCESS); 154 } 155 156 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 157 { 158 PetscFunctionBegin; 159 *array = NULL; 160 PetscFunctionReturn(PETSC_SUCCESS); 161 } 162 163 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 164 { 165 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 166 167 PetscFunctionBegin; 168 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 169 *array = aijkok->a_dual.view_host().data(); 170 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 171 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 172 } 173 PetscFunctionReturn(PETSC_SUCCESS); 174 } 175 176 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 177 { 178 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 179 180 PetscFunctionBegin; 181 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 182 aijkok->a_dual.clear_sync_state(); 183 aijkok->a_dual.modify_host(); 184 } 185 PetscFunctionReturn(PETSC_SUCCESS); 186 } 187 188 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 189 { 190 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 191 192 PetscFunctionBegin; 193 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 194 195 if (i) *i = aijkok->i_device_data(); 196 if (j) *j = aijkok->j_device_data(); 197 if (a) { 198 aijkok->a_dual.sync_device(); 199 *a = aijkok->a_device_data(); 200 } 201 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 202 PetscFunctionReturn(PETSC_SUCCESS); 203 } 204 205 /* 206 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 207 208 Input Parameter: 209 . A - the MATSEQAIJKOKKOS matrix 210 211 Output Parameters: 212 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 213 - T_d - the transpose on device, whose value array is allocated but not initialized 214 */ 215 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 216 { 217 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 218 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 219 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 220 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 221 MatRowMapType *Ti = Ti_h.data(); 222 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 223 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 224 PetscInt *Tj = Tj_h.data(); 225 PetscInt *perm = perm_h.data(); 226 PetscInt *offset; 227 228 PetscFunctionBegin; 229 // Populate Ti 230 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 231 Ti++; 232 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 233 Ti--; 234 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 235 236 // Populate Tj and the permutation array 237 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 238 for (PetscInt i = 0; i < m; i++) { 239 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 240 PetscInt r = Aj[j]; // row r of T 241 PetscInt disp = Ti[r] + offset[r]; 242 243 Tj[disp] = i; // col i of T 244 perm[disp] = j; 245 offset[r]++; 246 } 247 } 248 PetscCall(PetscFree(offset)); 249 250 // Sort each row of T, along with the permutation array 251 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 252 253 // Output perm and T on device 254 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 255 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 256 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 257 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 258 PetscFunctionReturn(PETSC_SUCCESS); 259 } 260 261 // Generate the transpose on device and cache it internally 262 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 263 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 264 { 265 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 266 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 267 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 268 KokkosCsrMatrix &T = akok->csrmatT; 269 270 PetscFunctionBegin; 271 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 272 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 273 274 const auto &Aa = akok->a_dual.view_device(); 275 276 if (A->symmetric == PETSC_BOOL3_TRUE) { 277 *csrmatT = akok->csrmat; 278 } else { 279 // See if we already have a cached transpose and its value is up to date 280 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 281 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 282 const auto &perm = akok->transpose_perm; // get the permutation array 283 auto &Ta = T.values; 284 285 PetscCallCXX(Kokkos::parallel_for( 286 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 287 } 288 } else { // Generate T of size n x m for the first time 289 MatRowMapKokkosView perm; 290 291 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 292 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 293 PetscCallCXX(Kokkos::parallel_for( 294 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 295 } 296 akok->transpose_updated = PETSC_TRUE; 297 *csrmatT = akok->csrmatT; 298 } 299 PetscFunctionReturn(PETSC_SUCCESS); 300 } 301 302 // Generate the Hermitian on device and cache it internally 303 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 304 { 305 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 306 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 307 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 308 KokkosCsrMatrix &T = akok->csrmatH; 309 310 PetscFunctionBegin; 311 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 312 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 313 314 const auto &Aa = akok->a_dual.view_device(); 315 316 if (A->hermitian == PETSC_BOOL3_TRUE) { 317 *csrmatH = akok->csrmat; 318 } else { 319 // See if we already have a cached hermitian and its value is up to date 320 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 321 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 322 const auto &perm = akok->transpose_perm; // get the permutation array 323 auto &Ta = T.values; 324 325 PetscCallCXX(Kokkos::parallel_for( 326 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for( 334 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 335 } 336 akok->hermitian_updated = PETSC_TRUE; 337 *csrmatH = akok->csrmatH; 338 } 339 PetscFunctionReturn(PETSC_SUCCESS); 340 } 341 342 /* y = A x */ 343 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 344 { 345 Mat_SeqAIJKokkos *aijkok; 346 ConstPetscScalarKokkosView xv; 347 PetscScalarKokkosView yv; 348 349 PetscFunctionBegin; 350 PetscCall(PetscLogGpuTimeBegin()); 351 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 352 PetscCall(VecGetKokkosView(xx, &xv)); 353 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 354 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 355 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 356 PetscCall(VecRestoreKokkosView(xx, &xv)); 357 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 358 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 359 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 360 PetscCall(PetscLogGpuTimeEnd()); 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 364 /* y = A^T x */ 365 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 366 { 367 Mat_SeqAIJKokkos *aijkok; 368 const char *mode; 369 ConstPetscScalarKokkosView xv; 370 PetscScalarKokkosView yv; 371 KokkosCsrMatrix csrmat; 372 373 PetscFunctionBegin; 374 PetscCall(PetscLogGpuTimeBegin()); 375 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 376 PetscCall(VecGetKokkosView(xx, &xv)); 377 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 378 if (A->form_explicit_transpose) { 379 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 380 mode = "N"; 381 } else { 382 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 383 csrmat = aijkok->csrmat; 384 mode = "T"; 385 } 386 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 387 PetscCall(VecRestoreKokkosView(xx, &xv)); 388 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 389 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 390 PetscCall(PetscLogGpuTimeEnd()); 391 PetscFunctionReturn(PETSC_SUCCESS); 392 } 393 394 /* y = A^H x */ 395 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 396 { 397 Mat_SeqAIJKokkos *aijkok; 398 const char *mode; 399 ConstPetscScalarKokkosView xv; 400 PetscScalarKokkosView yv; 401 KokkosCsrMatrix csrmat; 402 403 PetscFunctionBegin; 404 PetscCall(PetscLogGpuTimeBegin()); 405 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 406 PetscCall(VecGetKokkosView(xx, &xv)); 407 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 408 if (A->form_explicit_transpose) { 409 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 410 mode = "N"; 411 } else { 412 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 413 csrmat = aijkok->csrmat; 414 mode = "C"; 415 } 416 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 417 PetscCall(VecRestoreKokkosView(xx, &xv)); 418 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 419 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 420 PetscCall(PetscLogGpuTimeEnd()); 421 PetscFunctionReturn(PETSC_SUCCESS); 422 } 423 424 /* z = A x + y */ 425 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 426 { 427 Mat_SeqAIJKokkos *aijkok; 428 ConstPetscScalarKokkosView xv, yv; 429 PetscScalarKokkosView zv; 430 431 PetscFunctionBegin; 432 PetscCall(PetscLogGpuTimeBegin()); 433 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 434 PetscCall(VecGetKokkosView(xx, &xv)); 435 PetscCall(VecGetKokkosView(yy, &yv)); 436 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 437 if (zz != yy) Kokkos::deep_copy(zv, yv); 438 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 439 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 440 PetscCall(VecRestoreKokkosView(xx, &xv)); 441 PetscCall(VecRestoreKokkosView(yy, &yv)); 442 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 443 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 444 PetscCall(PetscLogGpuTimeEnd()); 445 PetscFunctionReturn(PETSC_SUCCESS); 446 } 447 448 /* z = A^T x + y */ 449 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 450 { 451 Mat_SeqAIJKokkos *aijkok; 452 const char *mode; 453 ConstPetscScalarKokkosView xv, yv; 454 PetscScalarKokkosView zv; 455 KokkosCsrMatrix csrmat; 456 457 PetscFunctionBegin; 458 PetscCall(PetscLogGpuTimeBegin()); 459 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 460 PetscCall(VecGetKokkosView(xx, &xv)); 461 PetscCall(VecGetKokkosView(yy, &yv)); 462 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 463 if (zz != yy) Kokkos::deep_copy(zv, yv); 464 if (A->form_explicit_transpose) { 465 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 466 mode = "N"; 467 } else { 468 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 469 csrmat = aijkok->csrmat; 470 mode = "T"; 471 } 472 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 473 PetscCall(VecRestoreKokkosView(xx, &xv)); 474 PetscCall(VecRestoreKokkosView(yy, &yv)); 475 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 476 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 477 PetscCall(PetscLogGpuTimeEnd()); 478 PetscFunctionReturn(PETSC_SUCCESS); 479 } 480 481 /* z = A^H x + y */ 482 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 483 { 484 Mat_SeqAIJKokkos *aijkok; 485 const char *mode; 486 ConstPetscScalarKokkosView xv, yv; 487 PetscScalarKokkosView zv; 488 KokkosCsrMatrix csrmat; 489 490 PetscFunctionBegin; 491 PetscCall(PetscLogGpuTimeBegin()); 492 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 493 PetscCall(VecGetKokkosView(xx, &xv)); 494 PetscCall(VecGetKokkosView(yy, &yv)); 495 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 496 if (zz != yy) Kokkos::deep_copy(zv, yv); 497 if (A->form_explicit_transpose) { 498 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 499 mode = "N"; 500 } else { 501 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 502 csrmat = aijkok->csrmat; 503 mode = "C"; 504 } 505 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 506 PetscCall(VecRestoreKokkosView(xx, &xv)); 507 PetscCall(VecRestoreKokkosView(yy, &yv)); 508 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 509 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 510 PetscCall(PetscLogGpuTimeEnd()); 511 PetscFunctionReturn(PETSC_SUCCESS); 512 } 513 514 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 515 { 516 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 517 518 PetscFunctionBegin; 519 switch (op) { 520 case MAT_FORM_EXPLICIT_TRANSPOSE: 521 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 522 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 523 A->form_explicit_transpose = flg; 524 break; 525 default: 526 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 527 break; 528 } 529 PetscFunctionReturn(PETSC_SUCCESS); 530 } 531 532 /* Depending on reuse, either build a new mat, or use the existing mat */ 533 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 534 { 535 Mat_SeqAIJ *aseq; 536 537 PetscFunctionBegin; 538 PetscCall(PetscKokkosInitializeCheck()); 539 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 540 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 541 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 542 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 543 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 544 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 545 PetscCall(PetscFree(A->defaultvectype)); 546 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 547 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 548 PetscCall(MatSetOps_SeqAIJKokkos(A)); 549 aseq = static_cast<Mat_SeqAIJ *>(A->data); 550 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 551 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 552 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 553 } 554 } 555 PetscFunctionReturn(PETSC_SUCCESS); 556 } 557 558 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 559 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 560 */ 561 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 562 { 563 Mat_SeqAIJ *bseq; 564 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 565 Mat mat; 566 567 PetscFunctionBegin; 568 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 569 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 570 mat = *B; 571 if (A->preallocated) { 572 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 573 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 574 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 575 /* Now copy values to B if needed */ 576 if (dupOption == MAT_COPY_VALUES) { 577 if (akok->a_dual.need_sync_device()) { 578 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 579 bkok->a_dual.modify_host(); 580 } else { /* If device has the latest data, we only copy data on device */ 581 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 582 bkok->a_dual.modify_device(); 583 } 584 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 585 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 586 bkok->a_dual.modify_host(); 587 } 588 mat->spptr = bkok; 589 } 590 591 PetscCall(PetscFree(mat->defaultvectype)); 592 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 593 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 594 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 595 PetscFunctionReturn(PETSC_SUCCESS); 596 } 597 598 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 599 { 600 Mat At; 601 KokkosCsrMatrix internT; 602 Mat_SeqAIJKokkos *atkok, *bkok; 603 604 PetscFunctionBegin; 605 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 606 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 607 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 608 /* Deep copy internT, as we want to isolate the internal transpose */ 609 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 610 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 611 if (reuse == MAT_INITIAL_MATRIX) *B = At; 612 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 613 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 614 if ((*B)->assembled) { 615 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 616 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 617 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 618 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 619 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 620 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 621 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 622 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 623 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 624 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 625 } 626 PetscFunctionReturn(PETSC_SUCCESS); 627 } 628 629 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 630 { 631 Mat_SeqAIJKokkos *aijkok; 632 633 PetscFunctionBegin; 634 if (A->factortype == MAT_FACTOR_NONE) { 635 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 636 delete aijkok; 637 } else { 638 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 639 } 640 A->spptr = NULL; 641 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 642 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 643 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 644 PetscCall(MatDestroy_SeqAIJ(A)); 645 PetscFunctionReturn(PETSC_SUCCESS); 646 } 647 648 /*MC 649 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 650 651 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 652 653 Options Database Key: 654 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 655 656 Level: beginner 657 658 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 659 M*/ 660 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 661 { 662 PetscFunctionBegin; 663 PetscCall(PetscKokkosInitializeCheck()); 664 PetscCall(MatCreate_SeqAIJ(A)); 665 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 666 PetscFunctionReturn(PETSC_SUCCESS); 667 } 668 669 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 670 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 671 { 672 Mat_SeqAIJ *a, *b; 673 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 674 MatScalarKokkosView aa, ba, ca; 675 MatRowMapKokkosView ai, bi, ci; 676 MatColIdxKokkosView aj, bj, cj; 677 PetscInt m, n, nnz, aN; 678 679 PetscFunctionBegin; 680 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 681 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 682 PetscAssertPointer(C, 4); 683 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 684 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 685 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 686 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 687 688 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 689 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 690 a = static_cast<Mat_SeqAIJ *>(A->data); 691 b = static_cast<Mat_SeqAIJ *>(B->data); 692 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 693 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 694 aa = akok->a_dual.view_device(); 695 ai = akok->i_dual.view_device(); 696 ba = bkok->a_dual.view_device(); 697 bi = bkok->i_dual.view_device(); 698 m = A->rmap->n; /* M, N and nnz of C */ 699 n = A->cmap->n + B->cmap->n; 700 nnz = a->nz + b->nz; 701 aN = A->cmap->n; /* N of A */ 702 if (reuse == MAT_INITIAL_MATRIX) { 703 aj = akok->j_dual.view_device(); 704 bj = bkok->j_dual.view_device(); 705 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 706 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 707 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 708 ca = ca_dual.view_device(); 709 ci = ci_dual.view_device(); 710 cj = cj_dual.view_device(); 711 712 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 713 Kokkos::parallel_for( 714 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 715 PetscInt i = t.league_rank(); /* row i */ 716 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 717 718 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 719 ci(i) = coffset; 720 if (i == m - 1) ci(m) = ai(m) + bi(m); 721 }); 722 723 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 724 if (k < alen) { 725 ca(coffset + k) = aa(ai(i) + k); 726 cj(coffset + k) = aj(ai(i) + k); 727 } else { 728 ca(coffset + k) = ba(bi(i) + k - alen); 729 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 730 } 731 }); 732 }); 733 ca_dual.modify_device(); 734 ci_dual.modify_device(); 735 cj_dual.modify_device(); 736 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 737 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 738 } else if (reuse == MAT_REUSE_MATRIX) { 739 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 740 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 741 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 742 ca = ckok->a_dual.view_device(); 743 ci = ckok->i_dual.view_device(); 744 745 Kokkos::parallel_for( 746 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 747 PetscInt i = t.league_rank(); /* row i */ 748 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 749 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 750 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 751 else ca(ci(i) + k) = ba(bi(i) + k - alen); 752 }); 753 }); 754 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 755 } 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 760 { 761 PetscFunctionBegin; 762 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 766 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 767 { 768 Mat_Product *product = C->product; 769 Mat A, B; 770 bool transA, transB; /* use bool, since KK needs this type */ 771 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 772 Mat_SeqAIJ *c; 773 MatProductData_SeqAIJKokkos *pdata; 774 KokkosCsrMatrix csrmatA, csrmatB; 775 776 PetscFunctionBegin; 777 MatCheckProduct(C, 1); 778 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 779 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 780 781 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 782 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 783 // we still do numeric. 784 if (pdata->reusesym) { // numeric reuses results from symbolic 785 pdata->reusesym = PETSC_FALSE; 786 PetscFunctionReturn(PETSC_SUCCESS); 787 } 788 789 switch (product->type) { 790 case MATPRODUCT_AB: 791 transA = false; 792 transB = false; 793 break; 794 case MATPRODUCT_AtB: 795 transA = true; 796 transB = false; 797 break; 798 case MATPRODUCT_ABt: 799 transA = false; 800 transB = true; 801 break; 802 default: 803 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 804 } 805 806 A = product->A; 807 B = product->B; 808 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 809 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 810 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 811 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 812 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 813 814 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 815 816 csrmatA = akok->csrmat; 817 csrmatB = bkok->csrmat; 818 819 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 820 if (transA) { 821 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 822 transA = false; 823 } 824 825 if (transB) { 826 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 827 transB = false; 828 } 829 PetscCall(PetscLogGpuTimeBegin()); 830 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 831 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 832 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 833 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 834 #endif 835 836 PetscCall(PetscLogGpuTimeEnd()); 837 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 838 /* shorter version of MatAssemblyEnd_SeqAIJ */ 839 c = (Mat_SeqAIJ *)C->data; 840 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 841 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 842 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 843 c->reallocs = 0; 844 C->info.mallocs = 0; 845 C->info.nz_unneeded = 0; 846 C->assembled = C->was_assembled = PETSC_TRUE; 847 C->num_ass++; 848 PetscFunctionReturn(PETSC_SUCCESS); 849 } 850 851 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 852 { 853 Mat_Product *product = C->product; 854 MatProductType ptype; 855 Mat A, B; 856 bool transA, transB; 857 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 858 MatProductData_SeqAIJKokkos *pdata; 859 MPI_Comm comm; 860 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 861 862 PetscFunctionBegin; 863 MatCheckProduct(C, 1); 864 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 865 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 866 A = product->A; 867 B = product->B; 868 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 869 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 870 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 871 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 872 csrmatA = akok->csrmat; 873 csrmatB = bkok->csrmat; 874 875 ptype = product->type; 876 // Take advantage of the symmetry if true 877 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 878 ptype = MATPRODUCT_AB; 879 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 880 } 881 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 882 ptype = MATPRODUCT_AB; 883 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 884 } 885 886 switch (ptype) { 887 case MATPRODUCT_AB: 888 transA = false; 889 transB = false; 890 break; 891 case MATPRODUCT_AtB: 892 transA = true; 893 transB = false; 894 break; 895 case MATPRODUCT_ABt: 896 transA = false; 897 transB = true; 898 break; 899 default: 900 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 901 } 902 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 903 pdata->reusesym = product->api_user; 904 905 /* TODO: add command line options to select spgemm algorithms */ 906 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 907 908 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 909 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 910 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 911 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 912 #endif 913 #endif 914 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 915 916 PetscCall(PetscLogGpuTimeBegin()); 917 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 918 if (transA) { 919 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 920 transA = false; 921 } 922 923 if (transB) { 924 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 925 transB = false; 926 } 927 928 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 929 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 930 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 931 calling new Mat_SeqAIJKokkos(). 932 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 933 */ 934 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 935 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 936 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 937 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 938 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 939 #endif 940 PetscCall(PetscLogGpuTimeEnd()); 941 942 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 943 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 944 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 945 PetscFunctionReturn(PETSC_SUCCESS); 946 } 947 948 /* handles sparse matrix matrix ops */ 949 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 950 { 951 Mat_Product *product = mat->product; 952 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 953 954 PetscFunctionBegin; 955 MatCheckProduct(mat, 1); 956 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 957 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 958 if (Biskok && Ciskok) { 959 switch (product->type) { 960 case MATPRODUCT_AB: 961 case MATPRODUCT_AtB: 962 case MATPRODUCT_ABt: 963 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 964 break; 965 case MATPRODUCT_PtAP: 966 case MATPRODUCT_RARt: 967 case MATPRODUCT_ABC: 968 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 969 break; 970 default: 971 break; 972 } 973 } else { /* fallback for AIJ */ 974 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 975 } 976 PetscFunctionReturn(PETSC_SUCCESS); 977 } 978 979 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 980 { 981 Mat_SeqAIJKokkos *aijkok; 982 983 PetscFunctionBegin; 984 PetscCall(PetscLogGpuTimeBegin()); 985 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 986 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 987 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 988 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 989 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 990 PetscCall(PetscLogGpuTimeEnd()); 991 PetscFunctionReturn(PETSC_SUCCESS); 992 } 993 994 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 995 { 996 Mat_SeqAIJKokkos *aijkok; 997 998 PetscFunctionBegin; 999 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1000 if (aijkok) { /* Only zero the device if data is already there */ 1001 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1002 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1003 } else { /* Might be preallocated but not assembled */ 1004 PetscCall(MatZeroEntries_SeqAIJ(A)); 1005 } 1006 PetscFunctionReturn(PETSC_SUCCESS); 1007 } 1008 1009 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1010 { 1011 Mat_SeqAIJ *aijseq; 1012 Mat_SeqAIJKokkos *aijkok; 1013 PetscInt n; 1014 PetscScalarKokkosView xv; 1015 1016 PetscFunctionBegin; 1017 PetscCall(VecGetLocalSize(x, &n)); 1018 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1019 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1020 1021 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1022 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1023 1024 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1025 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1026 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1027 aijkok->SetDiagonal(aijseq->diag); 1028 } 1029 1030 const auto &Aa = aijkok->a_dual.view_device(); 1031 const auto &Ai = aijkok->i_dual.view_device(); 1032 const auto &Adiag = aijkok->diag_dual.view_device(); 1033 1034 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1035 Kokkos::parallel_for( 1036 n, KOKKOS_LAMBDA(const PetscInt i) { 1037 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1038 else xv(i) = 0; 1039 }); 1040 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1041 PetscFunctionReturn(PETSC_SUCCESS); 1042 } 1043 1044 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1045 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1046 { 1047 Mat_SeqAIJKokkos *aijkok; 1048 1049 PetscFunctionBegin; 1050 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1051 PetscAssertPointer(kv, 2); 1052 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1053 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1054 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1055 *kv = aijkok->a_dual.view_device(); 1056 PetscFunctionReturn(PETSC_SUCCESS); 1057 } 1058 1059 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1060 { 1061 PetscFunctionBegin; 1062 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1063 PetscAssertPointer(kv, 2); 1064 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1065 PetscFunctionReturn(PETSC_SUCCESS); 1066 } 1067 1068 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1069 { 1070 Mat_SeqAIJKokkos *aijkok; 1071 1072 PetscFunctionBegin; 1073 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1074 PetscAssertPointer(kv, 2); 1075 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1076 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1077 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1078 *kv = aijkok->a_dual.view_device(); 1079 PetscFunctionReturn(PETSC_SUCCESS); 1080 } 1081 1082 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1083 { 1084 PetscFunctionBegin; 1085 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1086 PetscAssertPointer(kv, 2); 1087 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1088 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1093 { 1094 Mat_SeqAIJKokkos *aijkok; 1095 1096 PetscFunctionBegin; 1097 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1098 PetscAssertPointer(kv, 2); 1099 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1100 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1101 *kv = aijkok->a_dual.view_device(); 1102 PetscFunctionReturn(PETSC_SUCCESS); 1103 } 1104 1105 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1106 { 1107 PetscFunctionBegin; 1108 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1109 PetscAssertPointer(kv, 2); 1110 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1111 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1112 PetscFunctionReturn(PETSC_SUCCESS); 1113 } 1114 1115 /* Computes Y += alpha X */ 1116 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1117 { 1118 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1119 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1120 ConstMatScalarKokkosView Xa; 1121 MatScalarKokkosView Ya; 1122 1123 PetscFunctionBegin; 1124 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1125 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1126 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1127 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1128 PetscCall(PetscLogGpuTimeBegin()); 1129 1130 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1131 PetscBool e; 1132 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1133 if (e) { 1134 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1135 if (e) pattern = SAME_NONZERO_PATTERN; 1136 } 1137 } 1138 1139 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1140 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1141 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1142 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1143 */ 1144 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1145 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1146 Xa = xkok->a_dual.view_device(); 1147 Ya = ykok->a_dual.view_device(); 1148 1149 if (pattern == SAME_NONZERO_PATTERN) { 1150 KokkosBlas::axpy(alpha, Xa, Ya); 1151 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1152 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1153 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1154 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1155 1156 Kokkos::parallel_for( 1157 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1158 PetscInt i = t.league_rank(); // row i 1159 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1160 // Only one thread works in a team 1161 PetscInt p, q = Yi(i); 1162 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1163 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1164 if (Xj(p) == Yj(q)) { // Found it 1165 Ya(q) += alpha * Xa(p); 1166 q++; 1167 } else { 1168 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1169 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1170 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1171 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1172 #else 1173 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1174 #endif 1175 } 1176 } 1177 }); 1178 }); 1179 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1180 } else { // different nonzero patterns 1181 Mat Z; 1182 KokkosCsrMatrix zcsr; 1183 KernelHandle kh; 1184 kh.create_spadd_handle(true); // X, Y are sorted 1185 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1186 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1187 zkok = new Mat_SeqAIJKokkos(zcsr); 1188 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1189 PetscCall(MatHeaderReplace(Y, &Z)); 1190 kh.destroy_spadd_handle(); 1191 } 1192 PetscCall(PetscLogGpuTimeEnd()); 1193 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1194 PetscFunctionReturn(PETSC_SUCCESS); 1195 } 1196 1197 struct MatCOOStruct_SeqAIJKokkos { 1198 PetscCount n; 1199 PetscCount Atot; 1200 PetscInt nz; 1201 PetscCountKokkosView jmap; 1202 PetscCountKokkosView perm; 1203 1204 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1205 { 1206 nz = coo_h->nz; 1207 n = coo_h->n; 1208 Atot = coo_h->Atot; 1209 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1210 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1211 } 1212 }; 1213 1214 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1215 { 1216 PetscFunctionBegin; 1217 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1218 PetscFunctionReturn(PETSC_SUCCESS); 1219 } 1220 1221 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1222 { 1223 Mat_SeqAIJKokkos *akok; 1224 Mat_SeqAIJ *aseq; 1225 PetscContainer container_h, container_d; 1226 MatCOOStruct_SeqAIJ *coo_h; 1227 MatCOOStruct_SeqAIJKokkos *coo_d; 1228 1229 PetscFunctionBegin; 1230 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1231 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1232 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1233 delete akok; 1234 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1235 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1236 1237 // Copy the COO struct to device 1238 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1239 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1240 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1241 1242 // Put the COO struct in a container and then attach that to the matrix 1243 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1244 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1245 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1246 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1247 PetscCall(PetscContainerDestroy(&container_d)); 1248 PetscFunctionReturn(PETSC_SUCCESS); 1249 } 1250 1251 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1252 { 1253 MatScalarKokkosView Aa; 1254 ConstMatScalarKokkosView kv; 1255 PetscMemType memtype; 1256 PetscContainer container; 1257 MatCOOStruct_SeqAIJKokkos *coo; 1258 1259 PetscFunctionBegin; 1260 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1261 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1262 1263 const auto &n = coo->n; 1264 const auto &Annz = coo->nz; 1265 const auto &jmap = coo->jmap; 1266 const auto &perm = coo->perm; 1267 1268 PetscCall(PetscGetMemType(v, &memtype)); 1269 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1270 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1271 } else { 1272 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1273 } 1274 1275 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1276 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1277 1278 PetscCall(PetscLogGpuTimeBegin()); 1279 Kokkos::parallel_for( 1280 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1281 PetscScalar sum = 0.0; 1282 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1283 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1284 }); 1285 PetscCall(PetscLogGpuTimeEnd()); 1286 1287 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1288 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1293 { 1294 PetscFunctionBegin; 1295 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1296 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1297 B->offloadmask = PETSC_OFFLOAD_CPU; 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1302 { 1303 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1304 1305 PetscFunctionBegin; 1306 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1307 A->boundtocpu = PETSC_FALSE; 1308 1309 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1310 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1311 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1312 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1313 A->ops->scale = MatScale_SeqAIJKokkos; 1314 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1315 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1316 A->ops->mult = MatMult_SeqAIJKokkos; 1317 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1318 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1319 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1320 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1321 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1322 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1323 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1324 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1325 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1326 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1327 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1328 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1329 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1330 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1331 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1332 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1333 1334 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1335 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 /* 1340 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1341 1342 Input Parameters: 1343 + A - the MATSEQAIJKOKKOS matrix 1344 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1345 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1346 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1347 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1348 1349 Output Parameter: 1350 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1351 */ 1352 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1353 { 1354 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1355 PetscInt N = A->rmap->n; 1356 PetscInt nblocks = bs.extent(0) - 1; 1357 1358 PetscFunctionBegin; 1359 // Set the diagonal pointer on device if not already 1360 if (N && akok->diag_dual.extent(0) == 0) { 1361 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1362 akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag); 1363 } 1364 1365 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1366 1367 // Pull out the diagonal blocks of the matrix and then invert the blocks 1368 auto Aa = akok->a_dual.view_device(); 1369 auto Ai = akok->i_dual.view_device(); 1370 auto Aj = akok->j_dual.view_device(); 1371 auto Adiag = akok->diag_dual.view_device(); 1372 // TODO: how to tune the team size? 1373 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1374 auto ts = Kokkos::AUTO(); 1375 #else 1376 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1377 #endif 1378 PetscCallCXX(Kokkos::parallel_for( 1379 Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1380 const PetscInt bid = teamMember.league_rank(); // block id 1381 const PetscInt rstart = bs(bid); // this block starts from this row 1382 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1383 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1384 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1385 1386 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1387 PetscInt i = rstart + r; // i-th row in A 1388 1389 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1390 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1391 1392 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1393 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1394 B(r, c) = 0.0; 1395 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1396 B(r, c) = Aa(first + c); 1397 } else { // this entry does not show up in the CSR 1398 B(r, c) = 0.0; 1399 } 1400 } 1401 } else { // rare case that the diagonal does not exist 1402 const PetscInt begin = Ai(i); 1403 const PetscInt end = Ai(i + 1); 1404 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1405 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1406 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1407 else if (Aj(j) >= rstart + m) break; 1408 } 1409 } 1410 }); 1411 1412 // LU-decompose B (w/o pivoting) and then invert B 1413 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1414 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1415 })); 1416 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1417 PetscFunctionReturn(PETSC_SUCCESS); 1418 } 1419 1420 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1421 { 1422 Mat_SeqAIJ *aseq; 1423 PetscInt i, m, n; 1424 auto &exec = PetscGetKokkosExecutionSpace(); 1425 1426 PetscFunctionBegin; 1427 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1428 1429 m = akok->nrows(); 1430 n = akok->ncols(); 1431 PetscCall(MatSetSizes(A, m, n, m, n)); 1432 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1433 1434 /* Set up data structures of A as a MATSEQAIJ */ 1435 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1436 aseq = (Mat_SeqAIJ *)(A)->data; 1437 1438 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1439 PetscCallCXX(akok->j_dual.sync_host(exec)); 1440 PetscCallCXX(exec.fence()); 1441 1442 aseq->i = akok->i_host_data(); 1443 aseq->j = akok->j_host_data(); 1444 aseq->a = akok->a_host_data(); 1445 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1446 aseq->singlemalloc = PETSC_FALSE; 1447 aseq->free_a = PETSC_FALSE; 1448 aseq->free_ij = PETSC_FALSE; 1449 aseq->nz = akok->nnz(); 1450 aseq->maxnz = aseq->nz; 1451 1452 PetscCall(PetscMalloc1(m, &aseq->imax)); 1453 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1454 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1455 1456 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1457 akok->nonzerostate = A->nonzerostate; 1458 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1459 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1460 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1461 PetscFunctionReturn(PETSC_SUCCESS); 1462 } 1463 1464 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1465 { 1466 PetscFunctionBegin; 1467 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1468 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1469 PetscFunctionReturn(PETSC_SUCCESS); 1470 } 1471 1472 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1473 { 1474 Mat_SeqAIJKokkos *akok; 1475 PetscFunctionBegin; 1476 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1477 PetscCall(MatCreate(comm, A)); 1478 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1479 PetscFunctionReturn(PETSC_SUCCESS); 1480 } 1481 1482 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1483 1484 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1485 */ 1486 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1487 { 1488 PetscFunctionBegin; 1489 PetscCall(MatCreate(comm, A)); 1490 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1491 PetscFunctionReturn(PETSC_SUCCESS); 1492 } 1493 1494 /*@C 1495 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1496 (the default parallel PETSc format). This matrix will ultimately be handled by 1497 Kokkos for calculations. 1498 1499 Collective 1500 1501 Input Parameters: 1502 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1503 . m - number of rows 1504 . n - number of columns 1505 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1506 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1507 1508 Output Parameter: 1509 . A - the matrix 1510 1511 Level: intermediate 1512 1513 Notes: 1514 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1515 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1516 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1517 1518 The AIJ format, also called 1519 compressed row storage, is fully compatible with standard Fortran 1520 storage. That is, the stored row and column indices can begin at 1521 either one (as in Fortran) or zero. 1522 1523 Specify the preallocated storage with either `nz` or `nnz` (not both). 1524 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1525 allocation. 1526 1527 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1528 @*/ 1529 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1530 { 1531 PetscFunctionBegin; 1532 PetscCall(PetscKokkosInitializeCheck()); 1533 PetscCall(MatCreate(comm, A)); 1534 PetscCall(MatSetSizes(*A, m, n, m, n)); 1535 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1536 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1537 PetscFunctionReturn(PETSC_SUCCESS); 1538 } 1539 1540 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1541 { 1542 PetscFunctionBegin; 1543 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1544 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1545 PetscFunctionReturn(PETSC_SUCCESS); 1546 } 1547 1548 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1549 { 1550 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1551 1552 PetscFunctionBegin; 1553 if (!factors->sptrsv_symbolic_completed) { 1554 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1555 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1556 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1557 } 1558 PetscFunctionReturn(PETSC_SUCCESS); 1559 } 1560 1561 /* Check if we need to update factors etc for transpose solve */ 1562 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1563 { 1564 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1565 MatColIdxType n = A->rmap->n; 1566 1567 PetscFunctionBegin; 1568 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1569 /* Update L^T and do sptrsv symbolic */ 1570 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1571 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1572 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1573 1574 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1575 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1576 1577 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1578 We have to sort the indices, until KK provides finer control options. 1579 */ 1580 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1581 1582 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1583 1584 /* Update U^T and do sptrsv symbolic */ 1585 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1586 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1587 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1588 1589 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1590 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1591 1592 /* Sort indices. See comments above */ 1593 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1594 1595 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1596 factors->transpose_updated = PETSC_TRUE; 1597 } 1598 PetscFunctionReturn(PETSC_SUCCESS); 1599 } 1600 1601 /* Solve Ax = b, with A = LU */ 1602 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1603 { 1604 ConstPetscScalarKokkosView bv; 1605 PetscScalarKokkosView xv; 1606 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1607 1608 PetscFunctionBegin; 1609 PetscCall(PetscLogGpuTimeBegin()); 1610 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1611 PetscCall(VecGetKokkosView(b, &bv)); 1612 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1613 /* Solve L tmpv = b */ 1614 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1615 /* Solve Ux = tmpv */ 1616 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1617 PetscCall(VecRestoreKokkosView(b, &bv)); 1618 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1619 PetscCall(PetscLogGpuTimeEnd()); 1620 PetscFunctionReturn(PETSC_SUCCESS); 1621 } 1622 1623 /* Solve A^T x = b, where A^T = U^T L^T */ 1624 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1625 { 1626 ConstPetscScalarKokkosView bv; 1627 PetscScalarKokkosView xv; 1628 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1629 1630 PetscFunctionBegin; 1631 PetscCall(PetscLogGpuTimeBegin()); 1632 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1633 PetscCall(VecGetKokkosView(b, &bv)); 1634 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1635 /* Solve U^T tmpv = b */ 1636 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1637 1638 /* Solve L^T x = tmpv */ 1639 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1640 PetscCall(VecRestoreKokkosView(b, &bv)); 1641 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1642 PetscCall(PetscLogGpuTimeEnd()); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1647 { 1648 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1649 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1650 PetscInt fill_lev = info->levels; 1651 1652 PetscFunctionBegin; 1653 PetscCall(PetscLogGpuTimeBegin()); 1654 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1655 1656 auto a_d = aijkok->a_dual.view_device(); 1657 auto i_d = aijkok->i_dual.view_device(); 1658 auto j_d = aijkok->j_dual.view_device(); 1659 1660 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1661 1662 B->assembled = PETSC_TRUE; 1663 B->preallocated = PETSC_TRUE; 1664 B->ops->solve = MatSolve_SeqAIJKokkos; 1665 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1666 B->ops->matsolve = NULL; 1667 B->ops->matsolvetranspose = NULL; 1668 B->offloadmask = PETSC_OFFLOAD_GPU; 1669 1670 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1671 factors->transpose_updated = PETSC_FALSE; 1672 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1673 /* TODO: log flops, but how to know that? */ 1674 PetscCall(PetscLogGpuTimeEnd()); 1675 PetscFunctionReturn(PETSC_SUCCESS); 1676 } 1677 1678 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1679 { 1680 Mat_SeqAIJKokkos *aijkok; 1681 Mat_SeqAIJ *b; 1682 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1683 PetscInt fill_lev = info->levels; 1684 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1685 PetscInt n = A->rmap->n; 1686 1687 PetscFunctionBegin; 1688 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1689 /* Rebuild factors */ 1690 if (factors) { 1691 factors->Destroy(); 1692 } /* Destroy the old if it exists */ 1693 else { 1694 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1695 } 1696 1697 /* Create a spiluk handle and then do symbolic factorization */ 1698 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1699 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1700 1701 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1702 1703 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1704 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1705 Kokkos::realloc(factors->iU_d, n + 1); 1706 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1707 1708 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1709 auto i_d = aijkok->i_dual.view_device(); 1710 auto j_d = aijkok->j_dual.view_device(); 1711 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1712 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1713 1714 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1715 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1716 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1717 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1718 1719 /* TODO: add options to select sptrsv algorithms */ 1720 /* Create sptrsv handles for L, U and their transpose */ 1721 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1722 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1723 #else 1724 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1725 #endif 1726 1727 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1728 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1729 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1730 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1731 1732 /* Fill fields of the factor matrix B */ 1733 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1734 b = (Mat_SeqAIJ *)B->data; 1735 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1736 B->info.fill_ratio_given = info->fill; 1737 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1738 1739 B->offloadmask = PETSC_OFFLOAD_GPU; 1740 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1745 { 1746 PetscFunctionBegin; 1747 *type = MATSOLVERKOKKOS; 1748 PetscFunctionReturn(PETSC_SUCCESS); 1749 } 1750 1751 /*MC 1752 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1753 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1754 1755 Level: beginner 1756 1757 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1758 M*/ 1759 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1760 { 1761 PetscInt n = A->rmap->n; 1762 1763 PetscFunctionBegin; 1764 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1765 PetscCall(MatSetSizes(*B, n, n, n, n)); 1766 (*B)->factortype = ftype; 1767 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1768 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1769 1770 if (ftype == MAT_FACTOR_LU) { 1771 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1772 (*B)->canuseordering = PETSC_TRUE; 1773 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1774 } else if (ftype == MAT_FACTOR_ILU) { 1775 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1776 (*B)->canuseordering = PETSC_FALSE; 1777 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1778 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1779 1780 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1781 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1782 PetscFunctionReturn(PETSC_SUCCESS); 1783 } 1784 1785 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1786 { 1787 PetscFunctionBegin; 1788 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1789 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1790 PetscFunctionReturn(PETSC_SUCCESS); 1791 } 1792 1793 /* Utility to print out a KokkosCsrMatrix for debugging */ 1794 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1795 { 1796 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1797 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1798 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1799 const PetscInt *i = iv.data(); 1800 const PetscInt *j = jv.data(); 1801 const PetscScalar *a = av.data(); 1802 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1803 1804 PetscFunctionBegin; 1805 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1806 for (PetscInt k = 0; k < m; k++) { 1807 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1808 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1809 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1810 } 1811 PetscFunctionReturn(PETSC_SUCCESS); 1812 } 1813