1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for( 289 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 290 } 291 } else { // Generate T of size n x m for the first time 292 MatRowMapKokkosView perm; 293 294 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 295 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 296 PetscCallCXX(Kokkos::parallel_for( 297 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 298 } 299 akok->transpose_updated = PETSC_TRUE; 300 *csrmatT = akok->csrmatT; 301 } 302 PetscFunctionReturn(PETSC_SUCCESS); 303 } 304 305 // Generate the Hermitian on device and cache it internally 306 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 307 { 308 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 309 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 311 KokkosCsrMatrix &T = akok->csrmatH; 312 313 PetscFunctionBegin; 314 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 315 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 316 317 const auto &Aa = akok->a_dual.view_device(); 318 319 if (A->hermitian == PETSC_BOOL3_TRUE) { 320 *csrmatH = akok->csrmat; 321 } else { 322 // See if we already have a cached hermitian and its value is up to date 323 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 324 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 325 const auto &perm = akok->transpose_perm; // get the permutation array 326 auto &Ta = T.values; 327 328 PetscCallCXX(Kokkos::parallel_for( 329 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 330 } 331 } else { // Generate T of size n x m for the first time 332 MatRowMapKokkosView perm; 333 334 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 335 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 336 PetscCallCXX(Kokkos::parallel_for( 337 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 338 } 339 akok->hermitian_updated = PETSC_TRUE; 340 *csrmatH = akok->csrmatH; 341 } 342 PetscFunctionReturn(PETSC_SUCCESS); 343 } 344 345 /* y = A x */ 346 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 347 { 348 Mat_SeqAIJKokkos *aijkok; 349 ConstPetscScalarKokkosView xv; 350 PetscScalarKokkosView yv; 351 352 PetscFunctionBegin; 353 PetscCall(PetscLogGpuTimeBegin()); 354 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 355 PetscCall(VecGetKokkosView(xx, &xv)); 356 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 357 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 358 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 359 PetscCall(VecRestoreKokkosView(xx, &xv)); 360 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 361 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 362 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 363 PetscCall(PetscLogGpuTimeEnd()); 364 PetscFunctionReturn(PETSC_SUCCESS); 365 } 366 367 /* y = A^T x */ 368 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 369 { 370 Mat_SeqAIJKokkos *aijkok; 371 const char *mode; 372 ConstPetscScalarKokkosView xv; 373 PetscScalarKokkosView yv; 374 KokkosCsrMatrix csrmat; 375 376 PetscFunctionBegin; 377 PetscCall(PetscLogGpuTimeBegin()); 378 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 379 PetscCall(VecGetKokkosView(xx, &xv)); 380 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 386 csrmat = aijkok->csrmat; 387 mode = "T"; 388 } 389 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 390 PetscCall(VecRestoreKokkosView(xx, &xv)); 391 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 392 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 393 PetscCall(PetscLogGpuTimeEnd()); 394 PetscFunctionReturn(PETSC_SUCCESS); 395 } 396 397 /* y = A^H x */ 398 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 399 { 400 Mat_SeqAIJKokkos *aijkok; 401 const char *mode; 402 ConstPetscScalarKokkosView xv; 403 PetscScalarKokkosView yv; 404 KokkosCsrMatrix csrmat; 405 406 PetscFunctionBegin; 407 PetscCall(PetscLogGpuTimeBegin()); 408 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 409 PetscCall(VecGetKokkosView(xx, &xv)); 410 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 411 if (A->form_explicit_transpose) { 412 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 413 mode = "N"; 414 } else { 415 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 416 csrmat = aijkok->csrmat; 417 mode = "C"; 418 } 419 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 420 PetscCall(VecRestoreKokkosView(xx, &xv)); 421 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 422 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 423 PetscCall(PetscLogGpuTimeEnd()); 424 PetscFunctionReturn(PETSC_SUCCESS); 425 } 426 427 /* z = A x + y */ 428 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 429 { 430 Mat_SeqAIJKokkos *aijkok; 431 ConstPetscScalarKokkosView xv, yv; 432 PetscScalarKokkosView zv; 433 434 PetscFunctionBegin; 435 PetscCall(PetscLogGpuTimeBegin()); 436 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 437 PetscCall(VecGetKokkosView(xx, &xv)); 438 PetscCall(VecGetKokkosView(yy, &yv)); 439 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 440 if (zz != yy) Kokkos::deep_copy(zv, yv); 441 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 442 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 443 PetscCall(VecRestoreKokkosView(xx, &xv)); 444 PetscCall(VecRestoreKokkosView(yy, &yv)); 445 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 446 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 447 PetscCall(PetscLogGpuTimeEnd()); 448 PetscFunctionReturn(PETSC_SUCCESS); 449 } 450 451 /* z = A^T x + y */ 452 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 453 { 454 Mat_SeqAIJKokkos *aijkok; 455 const char *mode; 456 ConstPetscScalarKokkosView xv, yv; 457 PetscScalarKokkosView zv; 458 KokkosCsrMatrix csrmat; 459 460 PetscFunctionBegin; 461 PetscCall(PetscLogGpuTimeBegin()); 462 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 463 PetscCall(VecGetKokkosView(xx, &xv)); 464 PetscCall(VecGetKokkosView(yy, &yv)); 465 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 466 if (zz != yy) Kokkos::deep_copy(zv, yv); 467 if (A->form_explicit_transpose) { 468 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 469 mode = "N"; 470 } else { 471 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 472 csrmat = aijkok->csrmat; 473 mode = "T"; 474 } 475 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 476 PetscCall(VecRestoreKokkosView(xx, &xv)); 477 PetscCall(VecRestoreKokkosView(yy, &yv)); 478 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 479 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 480 PetscCall(PetscLogGpuTimeEnd()); 481 PetscFunctionReturn(PETSC_SUCCESS); 482 } 483 484 /* z = A^H x + y */ 485 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 486 { 487 Mat_SeqAIJKokkos *aijkok; 488 const char *mode; 489 ConstPetscScalarKokkosView xv, yv; 490 PetscScalarKokkosView zv; 491 KokkosCsrMatrix csrmat; 492 493 PetscFunctionBegin; 494 PetscCall(PetscLogGpuTimeBegin()); 495 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 496 PetscCall(VecGetKokkosView(xx, &xv)); 497 PetscCall(VecGetKokkosView(yy, &yv)); 498 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 499 if (zz != yy) Kokkos::deep_copy(zv, yv); 500 if (A->form_explicit_transpose) { 501 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 502 mode = "N"; 503 } else { 504 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 505 csrmat = aijkok->csrmat; 506 mode = "C"; 507 } 508 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 509 PetscCall(VecRestoreKokkosView(xx, &xv)); 510 PetscCall(VecRestoreKokkosView(yy, &yv)); 511 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 512 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 513 PetscCall(PetscLogGpuTimeEnd()); 514 PetscFunctionReturn(PETSC_SUCCESS); 515 } 516 517 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 518 { 519 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 520 521 PetscFunctionBegin; 522 switch (op) { 523 case MAT_FORM_EXPLICIT_TRANSPOSE: 524 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 525 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 526 A->form_explicit_transpose = flg; 527 break; 528 default: 529 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 530 break; 531 } 532 PetscFunctionReturn(PETSC_SUCCESS); 533 } 534 535 /* Depending on reuse, either build a new mat, or use the existing mat */ 536 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 537 { 538 Mat_SeqAIJ *aseq; 539 540 PetscFunctionBegin; 541 PetscCall(PetscKokkosInitializeCheck()); 542 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 543 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 544 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 545 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 546 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 547 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 548 PetscCall(PetscFree(A->defaultvectype)); 549 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 550 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 551 PetscCall(MatSetOps_SeqAIJKokkos(A)); 552 aseq = static_cast<Mat_SeqAIJ *>(A->data); 553 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 554 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 555 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 556 } 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 562 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 563 */ 564 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 565 { 566 Mat_SeqAIJ *bseq; 567 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 568 Mat mat; 569 570 PetscFunctionBegin; 571 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 572 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 573 mat = *B; 574 if (A->assembled) { 575 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 576 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 577 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 578 /* Now copy values to B if needed */ 579 if (dupOption == MAT_COPY_VALUES) { 580 if (akok->a_dual.need_sync_device()) { 581 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 582 bkok->a_dual.modify_host(); 583 } else { /* If device has the latest data, we only copy data on device */ 584 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 585 bkok->a_dual.modify_device(); 586 } 587 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 588 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 589 bkok->a_dual.modify_host(); 590 } 591 mat->spptr = bkok; 592 } 593 594 PetscCall(PetscFree(mat->defaultvectype)); 595 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 596 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 597 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 598 PetscFunctionReturn(PETSC_SUCCESS); 599 } 600 601 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 602 { 603 Mat At; 604 KokkosCsrMatrix internT; 605 Mat_SeqAIJKokkos *atkok, *bkok; 606 607 PetscFunctionBegin; 608 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 609 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 610 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 611 /* Deep copy internT, as we want to isolate the internal transpose */ 612 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 613 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 614 if (reuse == MAT_INITIAL_MATRIX) *B = At; 615 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 616 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 617 if ((*B)->assembled) { 618 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 619 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 620 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 621 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 622 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 623 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 624 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 625 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 626 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 627 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 628 } 629 PetscFunctionReturn(PETSC_SUCCESS); 630 } 631 632 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 633 { 634 Mat_SeqAIJKokkos *aijkok; 635 636 PetscFunctionBegin; 637 if (A->factortype == MAT_FACTOR_NONE) { 638 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 639 delete aijkok; 640 } else { 641 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 642 } 643 A->spptr = NULL; 644 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 645 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 646 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 647 PetscCall(MatDestroy_SeqAIJ(A)); 648 PetscFunctionReturn(PETSC_SUCCESS); 649 } 650 651 /*MC 652 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 653 654 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 655 656 Options Database Key: 657 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 658 659 Level: beginner 660 661 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 662 M*/ 663 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 664 { 665 PetscFunctionBegin; 666 PetscCall(PetscKokkosInitializeCheck()); 667 PetscCall(MatCreate_SeqAIJ(A)); 668 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 669 PetscFunctionReturn(PETSC_SUCCESS); 670 } 671 672 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 673 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 674 { 675 Mat_SeqAIJ *a, *b; 676 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 677 MatScalarKokkosView aa, ba, ca; 678 MatRowMapKokkosView ai, bi, ci; 679 MatColIdxKokkosView aj, bj, cj; 680 PetscInt m, n, nnz, aN; 681 682 PetscFunctionBegin; 683 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 684 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 685 PetscAssertPointer(C, 4); 686 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 687 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 688 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 689 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 690 691 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 692 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 693 a = static_cast<Mat_SeqAIJ *>(A->data); 694 b = static_cast<Mat_SeqAIJ *>(B->data); 695 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 696 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 697 aa = akok->a_dual.view_device(); 698 ai = akok->i_dual.view_device(); 699 ba = bkok->a_dual.view_device(); 700 bi = bkok->i_dual.view_device(); 701 m = A->rmap->n; /* M, N and nnz of C */ 702 n = A->cmap->n + B->cmap->n; 703 nnz = a->nz + b->nz; 704 aN = A->cmap->n; /* N of A */ 705 if (reuse == MAT_INITIAL_MATRIX) { 706 aj = akok->j_dual.view_device(); 707 bj = bkok->j_dual.view_device(); 708 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 709 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 710 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 711 ca = ca_dual.view_device(); 712 ci = ci_dual.view_device(); 713 cj = cj_dual.view_device(); 714 715 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 716 Kokkos::parallel_for( 717 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 718 PetscInt i = t.league_rank(); /* row i */ 719 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 720 721 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 722 ci(i) = coffset; 723 if (i == m - 1) ci(m) = ai(m) + bi(m); 724 }); 725 726 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 727 if (k < alen) { 728 ca(coffset + k) = aa(ai(i) + k); 729 cj(coffset + k) = aj(ai(i) + k); 730 } else { 731 ca(coffset + k) = ba(bi(i) + k - alen); 732 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 733 } 734 }); 735 }); 736 ca_dual.modify_device(); 737 ci_dual.modify_device(); 738 cj_dual.modify_device(); 739 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 740 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 741 } else if (reuse == MAT_REUSE_MATRIX) { 742 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 743 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 744 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 745 ca = ckok->a_dual.view_device(); 746 ci = ckok->i_dual.view_device(); 747 748 Kokkos::parallel_for( 749 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 750 PetscInt i = t.league_rank(); /* row i */ 751 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 752 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 753 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 754 else ca(ci(i) + k) = ba(bi(i) + k - alen); 755 }); 756 }); 757 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 758 } 759 PetscFunctionReturn(PETSC_SUCCESS); 760 } 761 762 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 763 { 764 PetscFunctionBegin; 765 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 766 PetscFunctionReturn(PETSC_SUCCESS); 767 } 768 769 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 770 { 771 Mat_Product *product = C->product; 772 Mat A, B; 773 bool transA, transB; /* use bool, since KK needs this type */ 774 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 775 Mat_SeqAIJ *c; 776 MatProductData_SeqAIJKokkos *pdata; 777 KokkosCsrMatrix csrmatA, csrmatB; 778 779 PetscFunctionBegin; 780 MatCheckProduct(C, 1); 781 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 782 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 783 784 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 785 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 786 // we still do numeric. 787 if (pdata->reusesym) { // numeric reuses results from symbolic 788 pdata->reusesym = PETSC_FALSE; 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 switch (product->type) { 793 case MATPRODUCT_AB: 794 transA = false; 795 transB = false; 796 break; 797 case MATPRODUCT_AtB: 798 transA = true; 799 transB = false; 800 break; 801 case MATPRODUCT_ABt: 802 transA = false; 803 transB = true; 804 break; 805 default: 806 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 807 } 808 809 A = product->A; 810 B = product->B; 811 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 812 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 813 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 814 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 815 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 816 817 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 818 819 csrmatA = akok->csrmat; 820 csrmatB = bkok->csrmat; 821 822 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 823 if (transA) { 824 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 825 transA = false; 826 } 827 828 if (transB) { 829 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 830 transB = false; 831 } 832 PetscCall(PetscLogGpuTimeBegin()); 833 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 834 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 835 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 836 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 837 #endif 838 839 PetscCall(PetscLogGpuTimeEnd()); 840 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 841 /* shorter version of MatAssemblyEnd_SeqAIJ */ 842 c = (Mat_SeqAIJ *)C->data; 843 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 844 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 845 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 846 c->reallocs = 0; 847 C->info.mallocs = 0; 848 C->info.nz_unneeded = 0; 849 C->assembled = C->was_assembled = PETSC_TRUE; 850 C->num_ass++; 851 PetscFunctionReturn(PETSC_SUCCESS); 852 } 853 854 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 855 { 856 Mat_Product *product = C->product; 857 MatProductType ptype; 858 Mat A, B; 859 bool transA, transB; 860 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 861 MatProductData_SeqAIJKokkos *pdata; 862 MPI_Comm comm; 863 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 864 865 PetscFunctionBegin; 866 MatCheckProduct(C, 1); 867 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 868 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 869 A = product->A; 870 B = product->B; 871 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 872 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 873 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 874 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 875 csrmatA = akok->csrmat; 876 csrmatB = bkok->csrmat; 877 878 ptype = product->type; 879 // Take advantage of the symmetry if true 880 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 881 ptype = MATPRODUCT_AB; 882 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 883 } 884 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 885 ptype = MATPRODUCT_AB; 886 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 887 } 888 889 switch (ptype) { 890 case MATPRODUCT_AB: 891 transA = false; 892 transB = false; 893 break; 894 case MATPRODUCT_AtB: 895 transA = true; 896 transB = false; 897 break; 898 case MATPRODUCT_ABt: 899 transA = false; 900 transB = true; 901 break; 902 default: 903 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 904 } 905 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 906 pdata->reusesym = product->api_user; 907 908 /* TODO: add command line options to select spgemm algorithms */ 909 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 910 911 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 912 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 913 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 914 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 915 #endif 916 #endif 917 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 918 919 PetscCall(PetscLogGpuTimeBegin()); 920 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 921 if (transA) { 922 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 923 transA = false; 924 } 925 926 if (transB) { 927 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 928 transB = false; 929 } 930 931 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 932 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 933 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 934 calling new Mat_SeqAIJKokkos(). 935 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 936 */ 937 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 938 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 939 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 940 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 941 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 942 #endif 943 PetscCall(PetscLogGpuTimeEnd()); 944 945 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 946 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 947 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 948 PetscFunctionReturn(PETSC_SUCCESS); 949 } 950 951 /* handles sparse matrix matrix ops */ 952 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 953 { 954 Mat_Product *product = mat->product; 955 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 956 957 PetscFunctionBegin; 958 MatCheckProduct(mat, 1); 959 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 960 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 961 if (Biskok && Ciskok) { 962 switch (product->type) { 963 case MATPRODUCT_AB: 964 case MATPRODUCT_AtB: 965 case MATPRODUCT_ABt: 966 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 967 break; 968 case MATPRODUCT_PtAP: 969 case MATPRODUCT_RARt: 970 case MATPRODUCT_ABC: 971 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 972 break; 973 default: 974 break; 975 } 976 } else { /* fallback for AIJ */ 977 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 978 } 979 PetscFunctionReturn(PETSC_SUCCESS); 980 } 981 982 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 983 { 984 Mat_SeqAIJKokkos *aijkok; 985 986 PetscFunctionBegin; 987 PetscCall(PetscLogGpuTimeBegin()); 988 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 989 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 990 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 991 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 992 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 993 PetscCall(PetscLogGpuTimeEnd()); 994 PetscFunctionReturn(PETSC_SUCCESS); 995 } 996 997 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 998 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 999 { 1000 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1001 1002 PetscFunctionBegin; 1003 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1004 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1005 1006 PetscCall(PetscLogGpuTimeBegin()); 1007 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009 const auto &Aa = aijkok->a_dual.view_device(); 1010 const auto &Adiag = aijkok->diag_dual.view_device(); 1011 PetscCallCXX(Kokkos::parallel_for( 1012 n, KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1013 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1014 PetscCall(PetscLogGpuFlops(n)); 1015 PetscCall(PetscLogGpuTimeEnd()); 1016 } else { // need reassembly, very slow! 1017 PetscCall(MatShift_Basic(A, a)); 1018 } 1019 PetscFunctionReturn(PETSC_SUCCESS); 1020 } 1021 1022 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1023 { 1024 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1025 1026 PetscFunctionBegin; 1027 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1028 ConstPetscScalarKokkosView dv; 1029 PetscInt n, nv; 1030 1031 PetscCall(PetscLogGpuTimeBegin()); 1032 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1033 PetscCall(VecGetKokkosView(D, &dv)); 1034 PetscCall(VecGetLocalSize(D, &nv)); 1035 n = PetscMin(Y->rmap->n, Y->cmap->n); 1036 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1037 1038 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1039 const auto &Aa = aijkok->a_dual.view_device(); 1040 const auto &Adiag = aijkok->diag_dual.view_device(); 1041 PetscCallCXX(Kokkos::parallel_for( 1042 n, KOKKOS_LAMBDA(const PetscInt i) { 1043 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1044 else Aa(Adiag(i)) += dv(i); 1045 })); 1046 PetscCall(VecRestoreKokkosView(D, &dv)); 1047 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1048 PetscCall(PetscLogGpuFlops(n)); 1049 PetscCall(PetscLogGpuTimeEnd()); 1050 } else { // need reassembly, very slow! 1051 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1052 } 1053 PetscFunctionReturn(PETSC_SUCCESS); 1054 } 1055 1056 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1057 { 1058 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1059 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1060 ConstPetscScalarKokkosView lv, rv; 1061 1062 PetscFunctionBegin; 1063 PetscCall(PetscLogGpuTimeBegin()); 1064 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1065 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1066 const auto &Aa = aijkok->a_dual.view_device(); 1067 const auto &Ai = aijkok->i_dual.view_device(); 1068 const auto &Aj = aijkok->j_dual.view_device(); 1069 if (ll) { 1070 PetscCall(VecGetLocalSize(ll, &m)); 1071 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1072 PetscCall(VecGetKokkosView(ll, &lv)); 1073 PetscCallCXX(Kokkos::parallel_for( // for each row 1074 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1075 PetscInt i = t.league_rank(); // row i 1076 PetscInt len = Ai(i + 1) - Ai(i); 1077 // scale entries on the row 1078 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1079 })); 1080 PetscCall(VecRestoreKokkosView(ll, &lv)); 1081 PetscCall(PetscLogGpuFlops(nz)); 1082 } 1083 if (rr) { 1084 PetscCall(VecGetLocalSize(rr, &n)); 1085 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1086 PetscCall(VecGetKokkosView(rr, &rv)); 1087 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1088 nz, KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1089 PetscCall(VecRestoreKokkosView(rr, &lv)); 1090 PetscCall(PetscLogGpuFlops(nz)); 1091 } 1092 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1093 PetscCall(PetscLogGpuTimeEnd()); 1094 PetscFunctionReturn(PETSC_SUCCESS); 1095 } 1096 1097 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1098 { 1099 Mat_SeqAIJKokkos *aijkok; 1100 1101 PetscFunctionBegin; 1102 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1103 if (aijkok) { /* Only zero the device if data is already there */ 1104 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1105 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1106 } else { /* Might be preallocated but not assembled */ 1107 PetscCall(MatZeroEntries_SeqAIJ(A)); 1108 } 1109 PetscFunctionReturn(PETSC_SUCCESS); 1110 } 1111 1112 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1113 { 1114 Mat_SeqAIJKokkos *aijkok; 1115 PetscInt n; 1116 PetscScalarKokkosView xv; 1117 1118 PetscFunctionBegin; 1119 PetscCall(VecGetLocalSize(x, &n)); 1120 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1121 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1122 1123 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1124 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1125 1126 const auto &Aa = aijkok->a_dual.view_device(); 1127 const auto &Ai = aijkok->i_dual.view_device(); 1128 const auto &Adiag = aijkok->diag_dual.view_device(); 1129 1130 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1131 Kokkos::parallel_for( 1132 n, KOKKOS_LAMBDA(const PetscInt i) { 1133 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1134 else xv(i) = 0; 1135 }); 1136 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1137 PetscFunctionReturn(PETSC_SUCCESS); 1138 } 1139 1140 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1141 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1142 { 1143 Mat_SeqAIJKokkos *aijkok; 1144 1145 PetscFunctionBegin; 1146 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1147 PetscAssertPointer(kv, 2); 1148 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1149 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1150 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1151 *kv = aijkok->a_dual.view_device(); 1152 PetscFunctionReturn(PETSC_SUCCESS); 1153 } 1154 1155 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1156 { 1157 PetscFunctionBegin; 1158 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1159 PetscAssertPointer(kv, 2); 1160 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1161 PetscFunctionReturn(PETSC_SUCCESS); 1162 } 1163 1164 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1165 { 1166 Mat_SeqAIJKokkos *aijkok; 1167 1168 PetscFunctionBegin; 1169 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1170 PetscAssertPointer(kv, 2); 1171 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1172 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1173 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1174 *kv = aijkok->a_dual.view_device(); 1175 PetscFunctionReturn(PETSC_SUCCESS); 1176 } 1177 1178 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1179 { 1180 PetscFunctionBegin; 1181 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1182 PetscAssertPointer(kv, 2); 1183 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1184 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1189 { 1190 Mat_SeqAIJKokkos *aijkok; 1191 1192 PetscFunctionBegin; 1193 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1194 PetscAssertPointer(kv, 2); 1195 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1196 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1197 *kv = aijkok->a_dual.view_device(); 1198 PetscFunctionReturn(PETSC_SUCCESS); 1199 } 1200 1201 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1202 { 1203 PetscFunctionBegin; 1204 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1205 PetscAssertPointer(kv, 2); 1206 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1207 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 1211 /* Computes Y += alpha X */ 1212 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1213 { 1214 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1215 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1216 ConstMatScalarKokkosView Xa; 1217 MatScalarKokkosView Ya; 1218 1219 PetscFunctionBegin; 1220 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1221 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1222 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1223 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1224 PetscCall(PetscLogGpuTimeBegin()); 1225 1226 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1227 PetscBool e; 1228 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1229 if (e) { 1230 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1231 if (e) pattern = SAME_NONZERO_PATTERN; 1232 } 1233 } 1234 1235 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1236 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1237 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1238 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1239 */ 1240 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1241 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1242 Xa = xkok->a_dual.view_device(); 1243 Ya = ykok->a_dual.view_device(); 1244 1245 if (pattern == SAME_NONZERO_PATTERN) { 1246 KokkosBlas::axpy(alpha, Xa, Ya); 1247 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1248 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1249 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1250 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1251 1252 Kokkos::parallel_for( 1253 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1254 PetscInt i = t.league_rank(); // row i 1255 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1256 // Only one thread works in a team 1257 PetscInt p, q = Yi(i); 1258 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1259 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1260 if (Xj(p) == Yj(q)) { // Found it 1261 Ya(q) += alpha * Xa(p); 1262 q++; 1263 } else { 1264 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1265 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1266 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1267 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1268 #else 1269 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1270 #endif 1271 } 1272 } 1273 }); 1274 }); 1275 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1276 } else { // different nonzero patterns 1277 Mat Z; 1278 KokkosCsrMatrix zcsr; 1279 KernelHandle kh; 1280 kh.create_spadd_handle(true); // X, Y are sorted 1281 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1282 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1283 zkok = new Mat_SeqAIJKokkos(zcsr); 1284 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1285 PetscCall(MatHeaderReplace(Y, &Z)); 1286 kh.destroy_spadd_handle(); 1287 } 1288 PetscCall(PetscLogGpuTimeEnd()); 1289 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1290 PetscFunctionReturn(PETSC_SUCCESS); 1291 } 1292 1293 struct MatCOOStruct_SeqAIJKokkos { 1294 PetscCount n; 1295 PetscCount Atot; 1296 PetscInt nz; 1297 PetscCountKokkosView jmap; 1298 PetscCountKokkosView perm; 1299 1300 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1301 { 1302 nz = coo_h->nz; 1303 n = coo_h->n; 1304 Atot = coo_h->Atot; 1305 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1306 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1307 } 1308 }; 1309 1310 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1311 { 1312 PetscFunctionBegin; 1313 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1314 PetscFunctionReturn(PETSC_SUCCESS); 1315 } 1316 1317 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1318 { 1319 Mat_SeqAIJKokkos *akok; 1320 Mat_SeqAIJ *aseq; 1321 PetscContainer container_h, container_d; 1322 MatCOOStruct_SeqAIJ *coo_h; 1323 MatCOOStruct_SeqAIJKokkos *coo_d; 1324 1325 PetscFunctionBegin; 1326 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1327 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1328 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1329 delete akok; 1330 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1331 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1332 1333 // Copy the COO struct to device 1334 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1335 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1336 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1337 1338 // Put the COO struct in a container and then attach that to the matrix 1339 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1340 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1341 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1342 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1343 PetscCall(PetscContainerDestroy(&container_d)); 1344 PetscFunctionReturn(PETSC_SUCCESS); 1345 } 1346 1347 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1348 { 1349 MatScalarKokkosView Aa; 1350 ConstMatScalarKokkosView kv; 1351 PetscMemType memtype; 1352 PetscContainer container; 1353 MatCOOStruct_SeqAIJKokkos *coo; 1354 1355 PetscFunctionBegin; 1356 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1357 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1358 1359 const auto &n = coo->n; 1360 const auto &Annz = coo->nz; 1361 const auto &jmap = coo->jmap; 1362 const auto &perm = coo->perm; 1363 1364 PetscCall(PetscGetMemType(v, &memtype)); 1365 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1366 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1367 } else { 1368 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1369 } 1370 1371 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1372 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1373 1374 PetscCall(PetscLogGpuTimeBegin()); 1375 Kokkos::parallel_for( 1376 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1377 PetscScalar sum = 0.0; 1378 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1379 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1380 }); 1381 PetscCall(PetscLogGpuTimeEnd()); 1382 1383 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1384 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1389 { 1390 PetscFunctionBegin; 1391 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1392 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1393 B->offloadmask = PETSC_OFFLOAD_CPU; 1394 PetscFunctionReturn(PETSC_SUCCESS); 1395 } 1396 1397 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1398 { 1399 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1400 1401 PetscFunctionBegin; 1402 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1403 A->boundtocpu = PETSC_FALSE; 1404 1405 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1406 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1407 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1408 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1409 A->ops->scale = MatScale_SeqAIJKokkos; 1410 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1411 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1412 A->ops->mult = MatMult_SeqAIJKokkos; 1413 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1414 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1415 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1416 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1417 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1418 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1419 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1420 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1421 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1422 A->ops->shift = MatShift_SeqAIJKokkos; 1423 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1424 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1425 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1426 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1427 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1428 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1429 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1430 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1431 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1432 1433 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1434 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1435 PetscFunctionReturn(PETSC_SUCCESS); 1436 } 1437 1438 /* 1439 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1440 1441 Input Parameters: 1442 + A - the MATSEQAIJKOKKOS matrix 1443 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1444 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1445 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1446 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1447 1448 Output Parameter: 1449 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1450 */ 1451 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1452 { 1453 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1454 PetscInt nblocks = bs.extent(0) - 1; 1455 1456 PetscFunctionBegin; 1457 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1458 1459 // Pull out the diagonal blocks of the matrix and then invert the blocks 1460 auto Aa = akok->a_dual.view_device(); 1461 auto Ai = akok->i_dual.view_device(); 1462 auto Aj = akok->j_dual.view_device(); 1463 auto Adiag = akok->diag_dual.view_device(); 1464 // TODO: how to tune the team size? 1465 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1466 auto ts = Kokkos::AUTO(); 1467 #else 1468 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1469 #endif 1470 PetscCallCXX(Kokkos::parallel_for( 1471 Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1472 const PetscInt bid = teamMember.league_rank(); // block id 1473 const PetscInt rstart = bs(bid); // this block starts from this row 1474 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1475 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1476 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1477 1478 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1479 PetscInt i = rstart + r; // i-th row in A 1480 1481 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1482 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1483 1484 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1485 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1486 B(r, c) = 0.0; 1487 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1488 B(r, c) = Aa(first + c); 1489 } else { // this entry does not show up in the CSR 1490 B(r, c) = 0.0; 1491 } 1492 } 1493 } else { // rare case that the diagonal does not exist 1494 const PetscInt begin = Ai(i); 1495 const PetscInt end = Ai(i + 1); 1496 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1497 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1498 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1499 else if (Aj(j) >= rstart + m) break; 1500 } 1501 } 1502 }); 1503 1504 // LU-decompose B (w/o pivoting) and then invert B 1505 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1506 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1507 })); 1508 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1509 PetscFunctionReturn(PETSC_SUCCESS); 1510 } 1511 1512 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1513 { 1514 Mat_SeqAIJ *aseq; 1515 PetscInt i, m, n; 1516 auto &exec = PetscGetKokkosExecutionSpace(); 1517 1518 PetscFunctionBegin; 1519 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1520 1521 m = akok->nrows(); 1522 n = akok->ncols(); 1523 PetscCall(MatSetSizes(A, m, n, m, n)); 1524 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1525 1526 /* Set up data structures of A as a MATSEQAIJ */ 1527 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1528 aseq = (Mat_SeqAIJ *)(A)->data; 1529 1530 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1531 PetscCallCXX(akok->j_dual.sync_host(exec)); 1532 PetscCallCXX(exec.fence()); 1533 1534 aseq->i = akok->i_host_data(); 1535 aseq->j = akok->j_host_data(); 1536 aseq->a = akok->a_host_data(); 1537 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1538 aseq->singlemalloc = PETSC_FALSE; 1539 aseq->free_a = PETSC_FALSE; 1540 aseq->free_ij = PETSC_FALSE; 1541 aseq->nz = akok->nnz(); 1542 aseq->maxnz = aseq->nz; 1543 1544 PetscCall(PetscMalloc1(m, &aseq->imax)); 1545 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1546 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1547 1548 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1549 akok->nonzerostate = A->nonzerostate; 1550 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1551 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1552 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1553 PetscFunctionReturn(PETSC_SUCCESS); 1554 } 1555 1556 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1557 { 1558 PetscFunctionBegin; 1559 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1560 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1561 PetscFunctionReturn(PETSC_SUCCESS); 1562 } 1563 1564 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1565 { 1566 Mat_SeqAIJKokkos *akok; 1567 PetscFunctionBegin; 1568 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1569 PetscCall(MatCreate(comm, A)); 1570 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1571 PetscFunctionReturn(PETSC_SUCCESS); 1572 } 1573 1574 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1575 1576 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1577 */ 1578 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1579 { 1580 PetscFunctionBegin; 1581 PetscCall(MatCreate(comm, A)); 1582 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 /*@C 1587 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1588 (the default parallel PETSc format). This matrix will ultimately be handled by 1589 Kokkos for calculations. 1590 1591 Collective 1592 1593 Input Parameters: 1594 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1595 . m - number of rows 1596 . n - number of columns 1597 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1598 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1599 1600 Output Parameter: 1601 . A - the matrix 1602 1603 Level: intermediate 1604 1605 Notes: 1606 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1607 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1608 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1609 1610 The AIJ format, also called 1611 compressed row storage, is fully compatible with standard Fortran 1612 storage. That is, the stored row and column indices can begin at 1613 either one (as in Fortran) or zero. 1614 1615 Specify the preallocated storage with either `nz` or `nnz` (not both). 1616 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1617 allocation. 1618 1619 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1620 @*/ 1621 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1622 { 1623 PetscFunctionBegin; 1624 PetscCall(PetscKokkosInitializeCheck()); 1625 PetscCall(MatCreate(comm, A)); 1626 PetscCall(MatSetSizes(*A, m, n, m, n)); 1627 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1628 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1633 { 1634 PetscFunctionBegin; 1635 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1636 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1637 PetscFunctionReturn(PETSC_SUCCESS); 1638 } 1639 1640 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1641 { 1642 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1643 1644 PetscFunctionBegin; 1645 if (!factors->sptrsv_symbolic_completed) { 1646 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1647 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1648 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1649 } 1650 PetscFunctionReturn(PETSC_SUCCESS); 1651 } 1652 1653 /* Check if we need to update factors etc for transpose solve */ 1654 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1655 { 1656 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1657 MatColIdxType n = A->rmap->n; 1658 1659 PetscFunctionBegin; 1660 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1661 /* Update L^T and do sptrsv symbolic */ 1662 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1663 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1664 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1665 1666 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1667 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1668 1669 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1670 We have to sort the indices, until KK provides finer control options. 1671 */ 1672 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1673 1674 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1675 1676 /* Update U^T and do sptrsv symbolic */ 1677 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1678 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1679 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1680 1681 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1682 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1683 1684 /* Sort indices. See comments above */ 1685 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1686 1687 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1688 factors->transpose_updated = PETSC_TRUE; 1689 } 1690 PetscFunctionReturn(PETSC_SUCCESS); 1691 } 1692 1693 /* Solve Ax = b, with A = LU */ 1694 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1695 { 1696 ConstPetscScalarKokkosView bv; 1697 PetscScalarKokkosView xv; 1698 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1699 1700 PetscFunctionBegin; 1701 PetscCall(PetscLogGpuTimeBegin()); 1702 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1703 PetscCall(VecGetKokkosView(b, &bv)); 1704 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1705 /* Solve L tmpv = b */ 1706 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1707 /* Solve Ux = tmpv */ 1708 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1709 PetscCall(VecRestoreKokkosView(b, &bv)); 1710 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1711 PetscCall(PetscLogGpuTimeEnd()); 1712 PetscFunctionReturn(PETSC_SUCCESS); 1713 } 1714 1715 /* Solve A^T x = b, where A^T = U^T L^T */ 1716 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1717 { 1718 ConstPetscScalarKokkosView bv; 1719 PetscScalarKokkosView xv; 1720 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1721 1722 PetscFunctionBegin; 1723 PetscCall(PetscLogGpuTimeBegin()); 1724 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1725 PetscCall(VecGetKokkosView(b, &bv)); 1726 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1727 /* Solve U^T tmpv = b */ 1728 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1729 1730 /* Solve L^T x = tmpv */ 1731 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1732 PetscCall(VecRestoreKokkosView(b, &bv)); 1733 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1734 PetscCall(PetscLogGpuTimeEnd()); 1735 PetscFunctionReturn(PETSC_SUCCESS); 1736 } 1737 1738 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1739 { 1740 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1741 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1742 PetscInt fill_lev = info->levels; 1743 1744 PetscFunctionBegin; 1745 PetscCall(PetscLogGpuTimeBegin()); 1746 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1747 1748 auto a_d = aijkok->a_dual.view_device(); 1749 auto i_d = aijkok->i_dual.view_device(); 1750 auto j_d = aijkok->j_dual.view_device(); 1751 1752 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1753 1754 B->assembled = PETSC_TRUE; 1755 B->preallocated = PETSC_TRUE; 1756 B->ops->solve = MatSolve_SeqAIJKokkos; 1757 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1758 B->ops->matsolve = NULL; 1759 B->ops->matsolvetranspose = NULL; 1760 B->offloadmask = PETSC_OFFLOAD_GPU; 1761 1762 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1763 factors->transpose_updated = PETSC_FALSE; 1764 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1765 /* TODO: log flops, but how to know that? */ 1766 PetscCall(PetscLogGpuTimeEnd()); 1767 PetscFunctionReturn(PETSC_SUCCESS); 1768 } 1769 1770 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1771 { 1772 Mat_SeqAIJKokkos *aijkok; 1773 Mat_SeqAIJ *b; 1774 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1775 PetscInt fill_lev = info->levels; 1776 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1777 PetscInt n = A->rmap->n; 1778 1779 PetscFunctionBegin; 1780 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1781 /* Rebuild factors */ 1782 if (factors) { 1783 factors->Destroy(); 1784 } /* Destroy the old if it exists */ 1785 else { 1786 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1787 } 1788 1789 /* Create a spiluk handle and then do symbolic factorization */ 1790 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1791 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1792 1793 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1794 1795 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1796 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1797 Kokkos::realloc(factors->iU_d, n + 1); 1798 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1799 1800 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1801 auto i_d = aijkok->i_dual.view_device(); 1802 auto j_d = aijkok->j_dual.view_device(); 1803 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1804 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1805 1806 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1807 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1808 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1809 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1810 1811 /* TODO: add options to select sptrsv algorithms */ 1812 /* Create sptrsv handles for L, U and their transpose */ 1813 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1814 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1815 #else 1816 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1817 #endif 1818 1819 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1820 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1821 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1822 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1823 1824 /* Fill fields of the factor matrix B */ 1825 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1826 b = (Mat_SeqAIJ *)B->data; 1827 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1828 B->info.fill_ratio_given = info->fill; 1829 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1830 1831 B->offloadmask = PETSC_OFFLOAD_GPU; 1832 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1833 PetscFunctionReturn(PETSC_SUCCESS); 1834 } 1835 1836 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1837 { 1838 PetscFunctionBegin; 1839 *type = MATSOLVERKOKKOS; 1840 PetscFunctionReturn(PETSC_SUCCESS); 1841 } 1842 1843 /*MC 1844 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1845 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1846 1847 Level: beginner 1848 1849 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1850 M*/ 1851 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1852 { 1853 PetscInt n = A->rmap->n; 1854 1855 PetscFunctionBegin; 1856 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1857 PetscCall(MatSetSizes(*B, n, n, n, n)); 1858 (*B)->factortype = ftype; 1859 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1860 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1861 1862 if (ftype == MAT_FACTOR_LU) { 1863 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1864 (*B)->canuseordering = PETSC_TRUE; 1865 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1866 } else if (ftype == MAT_FACTOR_ILU) { 1867 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1868 (*B)->canuseordering = PETSC_FALSE; 1869 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1870 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1871 1872 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1873 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1874 PetscFunctionReturn(PETSC_SUCCESS); 1875 } 1876 1877 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1878 { 1879 PetscFunctionBegin; 1880 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1881 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1882 PetscFunctionReturn(PETSC_SUCCESS); 1883 } 1884 1885 /* Utility to print out a KokkosCsrMatrix for debugging */ 1886 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1887 { 1888 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1889 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1890 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1891 const PetscInt *i = iv.data(); 1892 const PetscInt *j = jv.data(); 1893 const PetscScalar *a = av.data(); 1894 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1895 1896 PetscFunctionBegin; 1897 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1898 for (PetscInt k = 0; k < m; k++) { 1899 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1900 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1901 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1902 } 1903 PetscFunctionReturn(PETSC_SUCCESS); 1904 } 1905