1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289 } 290 } else { // Generate T of size n x m for the first time 291 MatRowMapKokkosView perm; 292 293 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 294 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295 PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 296 } 297 akok->transpose_updated = PETSC_TRUE; 298 *csrmatT = akok->csrmatT; 299 } 300 PetscFunctionReturn(PETSC_SUCCESS); 301 } 302 303 // Generate the Hermitian on device and cache it internally 304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 305 { 306 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 307 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 308 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 309 KokkosCsrMatrix &T = akok->csrmatH; 310 311 PetscFunctionBegin; 312 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 313 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 314 315 const auto &Aa = akok->a_dual.view_device(); 316 317 if (A->hermitian == PETSC_BOOL3_TRUE) { 318 *csrmatH = akok->csrmat; 319 } else { 320 // See if we already have a cached hermitian and its value is up to date 321 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 322 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 323 const auto &perm = akok->transpose_perm; // get the permutation array 324 auto &Ta = T.values; 325 326 PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 334 } 335 akok->hermitian_updated = PETSC_TRUE; 336 *csrmatH = akok->csrmatH; 337 } 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 /* y = A x */ 342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv; 346 PetscScalarKokkosView yv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx, &xv)); 352 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 353 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 355 PetscCall(VecRestoreKokkosView(xx, &xv)); 356 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 358 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 359 PetscCall(PetscLogGpuTimeEnd()); 360 PetscFunctionReturn(PETSC_SUCCESS); 361 } 362 363 /* y = A^T x */ 364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365 { 366 Mat_SeqAIJKokkos *aijkok; 367 const char *mode; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 KokkosCsrMatrix csrmat; 371 372 PetscFunctionBegin; 373 PetscCall(PetscLogGpuTimeBegin()); 374 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 375 PetscCall(VecGetKokkosView(xx, &xv)); 376 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377 if (A->form_explicit_transpose) { 378 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379 mode = "N"; 380 } else { 381 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 382 csrmat = aijkok->csrmat; 383 mode = "T"; 384 } 385 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 386 PetscCall(VecRestoreKokkosView(xx, &xv)); 387 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 388 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 /* y = A^H x */ 394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv; 399 PetscScalarKokkosView yv; 400 KokkosCsrMatrix csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx, &xv)); 406 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407 if (A->form_explicit_transpose) { 408 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409 mode = "N"; 410 } else { 411 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 412 csrmat = aijkok->csrmat; 413 mode = "C"; 414 } 415 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 416 PetscCall(VecRestoreKokkosView(xx, &xv)); 417 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 418 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 419 PetscCall(PetscLogGpuTimeEnd()); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 /* z = A x + y */ 424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425 { 426 Mat_SeqAIJKokkos *aijkok; 427 ConstPetscScalarKokkosView xv, yv; 428 PetscScalarKokkosView zv; 429 430 PetscFunctionBegin; 431 PetscCall(PetscLogGpuTimeBegin()); 432 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 433 PetscCall(VecGetKokkosView(xx, &xv)); 434 PetscCall(VecGetKokkosView(yy, &yv)); 435 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 436 if (zz != yy) Kokkos::deep_copy(zv, yv); 437 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 438 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 439 PetscCall(VecRestoreKokkosView(xx, &xv)); 440 PetscCall(VecRestoreKokkosView(yy, &yv)); 441 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 442 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 443 PetscCall(PetscLogGpuTimeEnd()); 444 PetscFunctionReturn(PETSC_SUCCESS); 445 } 446 447 /* z = A^T x + y */ 448 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 449 { 450 Mat_SeqAIJKokkos *aijkok; 451 const char *mode; 452 ConstPetscScalarKokkosView xv, yv; 453 PetscScalarKokkosView zv; 454 KokkosCsrMatrix csrmat; 455 456 PetscFunctionBegin; 457 PetscCall(PetscLogGpuTimeBegin()); 458 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 459 PetscCall(VecGetKokkosView(xx, &xv)); 460 PetscCall(VecGetKokkosView(yy, &yv)); 461 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 462 if (zz != yy) Kokkos::deep_copy(zv, yv); 463 if (A->form_explicit_transpose) { 464 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 465 mode = "N"; 466 } else { 467 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 468 csrmat = aijkok->csrmat; 469 mode = "T"; 470 } 471 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 472 PetscCall(VecRestoreKokkosView(xx, &xv)); 473 PetscCall(VecRestoreKokkosView(yy, &yv)); 474 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 475 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 476 PetscCall(PetscLogGpuTimeEnd()); 477 PetscFunctionReturn(PETSC_SUCCESS); 478 } 479 480 /* z = A^H x + y */ 481 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 482 { 483 Mat_SeqAIJKokkos *aijkok; 484 const char *mode; 485 ConstPetscScalarKokkosView xv, yv; 486 PetscScalarKokkosView zv; 487 KokkosCsrMatrix csrmat; 488 489 PetscFunctionBegin; 490 PetscCall(PetscLogGpuTimeBegin()); 491 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 492 PetscCall(VecGetKokkosView(xx, &xv)); 493 PetscCall(VecGetKokkosView(yy, &yv)); 494 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 495 if (zz != yy) Kokkos::deep_copy(zv, yv); 496 if (A->form_explicit_transpose) { 497 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 498 mode = "N"; 499 } else { 500 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 501 csrmat = aijkok->csrmat; 502 mode = "C"; 503 } 504 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 505 PetscCall(VecRestoreKokkosView(xx, &xv)); 506 PetscCall(VecRestoreKokkosView(yy, &yv)); 507 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 508 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 509 PetscCall(PetscLogGpuTimeEnd()); 510 PetscFunctionReturn(PETSC_SUCCESS); 511 } 512 513 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 514 { 515 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 516 517 PetscFunctionBegin; 518 switch (op) { 519 case MAT_FORM_EXPLICIT_TRANSPOSE: 520 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 521 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 522 A->form_explicit_transpose = flg; 523 break; 524 default: 525 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 526 break; 527 } 528 PetscFunctionReturn(PETSC_SUCCESS); 529 } 530 531 /* Depending on reuse, either build a new mat, or use the existing mat */ 532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 533 { 534 Mat_SeqAIJ *aseq; 535 536 PetscFunctionBegin; 537 PetscCall(PetscKokkosInitializeCheck()); 538 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 539 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 540 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 541 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 542 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 543 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 544 PetscCall(PetscFree(A->defaultvectype)); 545 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 546 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 547 PetscCall(MatSetOps_SeqAIJKokkos(A)); 548 aseq = static_cast<Mat_SeqAIJ *>(A->data); 549 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 550 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 551 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 552 } 553 } 554 PetscFunctionReturn(PETSC_SUCCESS); 555 } 556 557 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 558 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 559 */ 560 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 561 { 562 Mat_SeqAIJ *bseq; 563 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 564 Mat mat; 565 566 PetscFunctionBegin; 567 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 568 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 569 mat = *B; 570 if (A->assembled) { 571 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 572 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 573 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 574 /* Now copy values to B if needed */ 575 if (dupOption == MAT_COPY_VALUES) { 576 if (akok->a_dual.need_sync_device()) { 577 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 578 bkok->a_dual.modify_host(); 579 } else { /* If device has the latest data, we only copy data on device */ 580 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 581 bkok->a_dual.modify_device(); 582 } 583 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 584 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 585 bkok->a_dual.modify_host(); 586 } 587 mat->spptr = bkok; 588 } 589 590 PetscCall(PetscFree(mat->defaultvectype)); 591 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 592 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 593 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 594 PetscFunctionReturn(PETSC_SUCCESS); 595 } 596 597 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 598 { 599 Mat At; 600 KokkosCsrMatrix internT; 601 Mat_SeqAIJKokkos *atkok, *bkok; 602 603 PetscFunctionBegin; 604 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 605 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 606 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 607 /* Deep copy internT, as we want to isolate the internal transpose */ 608 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 609 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 610 if (reuse == MAT_INITIAL_MATRIX) *B = At; 611 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 612 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 613 if ((*B)->assembled) { 614 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 615 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 616 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 617 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 618 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 619 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 620 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 621 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 622 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 623 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 624 } 625 PetscFunctionReturn(PETSC_SUCCESS); 626 } 627 628 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 629 { 630 Mat_SeqAIJKokkos *aijkok; 631 632 PetscFunctionBegin; 633 if (A->factortype == MAT_FACTOR_NONE) { 634 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 635 delete aijkok; 636 } else { 637 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 638 } 639 A->spptr = NULL; 640 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 641 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 642 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 643 PetscCall(MatDestroy_SeqAIJ(A)); 644 PetscFunctionReturn(PETSC_SUCCESS); 645 } 646 647 /*MC 648 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 649 650 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 651 652 Options Database Key: 653 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 654 655 Level: beginner 656 657 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 658 M*/ 659 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 660 { 661 PetscFunctionBegin; 662 PetscCall(PetscKokkosInitializeCheck()); 663 PetscCall(MatCreate_SeqAIJ(A)); 664 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 665 PetscFunctionReturn(PETSC_SUCCESS); 666 } 667 668 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 669 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 670 { 671 Mat_SeqAIJ *a, *b; 672 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 673 MatScalarKokkosView aa, ba, ca; 674 MatRowMapKokkosView ai, bi, ci; 675 MatColIdxKokkosView aj, bj, cj; 676 PetscInt m, n, nnz, aN; 677 678 PetscFunctionBegin; 679 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 680 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 681 PetscAssertPointer(C, 4); 682 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 683 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 684 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 685 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 686 687 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 688 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 689 a = static_cast<Mat_SeqAIJ *>(A->data); 690 b = static_cast<Mat_SeqAIJ *>(B->data); 691 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 692 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 693 aa = akok->a_dual.view_device(); 694 ai = akok->i_dual.view_device(); 695 ba = bkok->a_dual.view_device(); 696 bi = bkok->i_dual.view_device(); 697 m = A->rmap->n; /* M, N and nnz of C */ 698 n = A->cmap->n + B->cmap->n; 699 nnz = a->nz + b->nz; 700 aN = A->cmap->n; /* N of A */ 701 if (reuse == MAT_INITIAL_MATRIX) { 702 aj = akok->j_dual.view_device(); 703 bj = bkok->j_dual.view_device(); 704 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 705 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 706 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 707 ca = ca_dual.view_device(); 708 ci = ci_dual.view_device(); 709 cj = cj_dual.view_device(); 710 711 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 712 Kokkos::parallel_for( 713 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 714 PetscInt i = t.league_rank(); /* row i */ 715 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 716 717 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 718 ci(i) = coffset; 719 if (i == m - 1) ci(m) = ai(m) + bi(m); 720 }); 721 722 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 723 if (k < alen) { 724 ca(coffset + k) = aa(ai(i) + k); 725 cj(coffset + k) = aj(ai(i) + k); 726 } else { 727 ca(coffset + k) = ba(bi(i) + k - alen); 728 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 729 } 730 }); 731 }); 732 ca_dual.modify_device(); 733 ci_dual.modify_device(); 734 cj_dual.modify_device(); 735 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 736 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 737 } else if (reuse == MAT_REUSE_MATRIX) { 738 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 739 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 740 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 741 ca = ckok->a_dual.view_device(); 742 ci = ckok->i_dual.view_device(); 743 744 Kokkos::parallel_for( 745 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 746 PetscInt i = t.league_rank(); /* row i */ 747 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 748 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 749 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 750 else ca(ci(i) + k) = ba(bi(i) + k - alen); 751 }); 752 }); 753 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 754 } 755 PetscFunctionReturn(PETSC_SUCCESS); 756 } 757 758 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 759 { 760 PetscFunctionBegin; 761 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 766 { 767 Mat_Product *product = C->product; 768 Mat A, B; 769 bool transA, transB; /* use bool, since KK needs this type */ 770 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 771 Mat_SeqAIJ *c; 772 MatProductData_SeqAIJKokkos *pdata; 773 KokkosCsrMatrix csrmatA, csrmatB; 774 775 PetscFunctionBegin; 776 MatCheckProduct(C, 1); 777 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 778 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 779 780 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 781 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 782 // we still do numeric. 783 if (pdata->reusesym) { // numeric reuses results from symbolic 784 pdata->reusesym = PETSC_FALSE; 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 switch (product->type) { 789 case MATPRODUCT_AB: 790 transA = false; 791 transB = false; 792 break; 793 case MATPRODUCT_AtB: 794 transA = true; 795 transB = false; 796 break; 797 case MATPRODUCT_ABt: 798 transA = false; 799 transB = true; 800 break; 801 default: 802 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 803 } 804 805 A = product->A; 806 B = product->B; 807 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 808 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 809 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 810 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 811 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 812 813 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 814 815 csrmatA = akok->csrmat; 816 csrmatB = bkok->csrmat; 817 818 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 819 if (transA) { 820 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 821 transA = false; 822 } 823 824 if (transB) { 825 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 826 transB = false; 827 } 828 PetscCall(PetscLogGpuTimeBegin()); 829 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 830 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 831 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 832 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 833 #endif 834 835 PetscCall(PetscLogGpuTimeEnd()); 836 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 837 /* shorter version of MatAssemblyEnd_SeqAIJ */ 838 c = (Mat_SeqAIJ *)C->data; 839 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 840 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 841 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 842 c->reallocs = 0; 843 C->info.mallocs = 0; 844 C->info.nz_unneeded = 0; 845 C->assembled = C->was_assembled = PETSC_TRUE; 846 C->num_ass++; 847 PetscFunctionReturn(PETSC_SUCCESS); 848 } 849 850 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 851 { 852 Mat_Product *product = C->product; 853 MatProductType ptype; 854 Mat A, B; 855 bool transA, transB; 856 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 857 MatProductData_SeqAIJKokkos *pdata; 858 MPI_Comm comm; 859 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 860 861 PetscFunctionBegin; 862 MatCheckProduct(C, 1); 863 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 864 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 865 A = product->A; 866 B = product->B; 867 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 868 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 869 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 870 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 871 csrmatA = akok->csrmat; 872 csrmatB = bkok->csrmat; 873 874 ptype = product->type; 875 // Take advantage of the symmetry if true 876 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 877 ptype = MATPRODUCT_AB; 878 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 879 } 880 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 881 ptype = MATPRODUCT_AB; 882 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 883 } 884 885 switch (ptype) { 886 case MATPRODUCT_AB: 887 transA = false; 888 transB = false; 889 break; 890 case MATPRODUCT_AtB: 891 transA = true; 892 transB = false; 893 break; 894 case MATPRODUCT_ABt: 895 transA = false; 896 transB = true; 897 break; 898 default: 899 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 900 } 901 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 902 pdata->reusesym = product->api_user; 903 904 /* TODO: add command line options to select spgemm algorithms */ 905 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 906 907 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 908 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 909 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 910 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 911 #endif 912 #endif 913 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 914 915 PetscCall(PetscLogGpuTimeBegin()); 916 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 917 if (transA) { 918 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 919 transA = false; 920 } 921 922 if (transB) { 923 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 924 transB = false; 925 } 926 927 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 929 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 930 calling new Mat_SeqAIJKokkos(). 931 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 932 */ 933 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 934 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 935 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 936 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 937 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 938 #endif 939 PetscCall(PetscLogGpuTimeEnd()); 940 941 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 942 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 943 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 944 PetscFunctionReturn(PETSC_SUCCESS); 945 } 946 947 /* handles sparse matrix matrix ops */ 948 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 949 { 950 Mat_Product *product = mat->product; 951 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 952 953 PetscFunctionBegin; 954 MatCheckProduct(mat, 1); 955 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 956 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 957 if (Biskok && Ciskok) { 958 switch (product->type) { 959 case MATPRODUCT_AB: 960 case MATPRODUCT_AtB: 961 case MATPRODUCT_ABt: 962 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 963 break; 964 case MATPRODUCT_PtAP: 965 case MATPRODUCT_RARt: 966 case MATPRODUCT_ABC: 967 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 968 break; 969 default: 970 break; 971 } 972 } else { /* fallback for AIJ */ 973 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 974 } 975 PetscFunctionReturn(PETSC_SUCCESS); 976 } 977 978 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 979 { 980 Mat_SeqAIJKokkos *aijkok; 981 982 PetscFunctionBegin; 983 PetscCall(PetscLogGpuTimeBegin()); 984 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 985 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 986 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 987 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 988 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 989 PetscCall(PetscLogGpuTimeEnd()); 990 PetscFunctionReturn(PETSC_SUCCESS); 991 } 992 993 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 994 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 995 { 996 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 997 998 PetscFunctionBegin; 999 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1000 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1001 1002 PetscCall(PetscLogGpuTimeBegin()); 1003 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1004 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1005 const auto &Aa = aijkok->a_dual.view_device(); 1006 const auto &Adiag = aijkok->diag_dual.view_device(); 1007 PetscCallCXX(Kokkos::parallel_for(n, KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1008 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1009 PetscCall(PetscLogGpuFlops(n)); 1010 PetscCall(PetscLogGpuTimeEnd()); 1011 } else { // need reassembly, very slow! 1012 PetscCall(MatShift_Basic(A, a)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1018 { 1019 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1020 1021 PetscFunctionBegin; 1022 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1023 ConstPetscScalarKokkosView dv; 1024 PetscInt n, nv; 1025 1026 PetscCall(PetscLogGpuTimeBegin()); 1027 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1028 PetscCall(VecGetKokkosView(D, &dv)); 1029 PetscCall(VecGetLocalSize(D, &nv)); 1030 n = PetscMin(Y->rmap->n, Y->cmap->n); 1031 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1032 1033 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1034 const auto &Aa = aijkok->a_dual.view_device(); 1035 const auto &Adiag = aijkok->diag_dual.view_device(); 1036 PetscCallCXX(Kokkos::parallel_for( 1037 n, KOKKOS_LAMBDA(const PetscInt i) { 1038 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1039 else Aa(Adiag(i)) += dv(i); 1040 })); 1041 PetscCall(VecRestoreKokkosView(D, &dv)); 1042 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1043 PetscCall(PetscLogGpuFlops(n)); 1044 PetscCall(PetscLogGpuTimeEnd()); 1045 } else { // need reassembly, very slow! 1046 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1047 } 1048 PetscFunctionReturn(PETSC_SUCCESS); 1049 } 1050 1051 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1052 { 1053 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1054 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1055 ConstPetscScalarKokkosView lv, rv; 1056 1057 PetscFunctionBegin; 1058 PetscCall(PetscLogGpuTimeBegin()); 1059 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1060 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1061 const auto &Aa = aijkok->a_dual.view_device(); 1062 const auto &Ai = aijkok->i_dual.view_device(); 1063 const auto &Aj = aijkok->j_dual.view_device(); 1064 if (ll) { 1065 PetscCall(VecGetLocalSize(ll, &m)); 1066 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1067 PetscCall(VecGetKokkosView(ll, &lv)); 1068 PetscCallCXX(Kokkos::parallel_for( // for each row 1069 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1070 PetscInt i = t.league_rank(); // row i 1071 PetscInt len = Ai(i + 1) - Ai(i); 1072 // scale entries on the row 1073 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1074 })); 1075 PetscCall(VecRestoreKokkosView(ll, &lv)); 1076 PetscCall(PetscLogGpuFlops(nz)); 1077 } 1078 if (rr) { 1079 PetscCall(VecGetLocalSize(rr, &n)); 1080 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1081 PetscCall(VecGetKokkosView(rr, &rv)); 1082 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1083 nz, KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1084 PetscCall(VecRestoreKokkosView(rr, &lv)); 1085 PetscCall(PetscLogGpuFlops(nz)); 1086 } 1087 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1088 PetscCall(PetscLogGpuTimeEnd()); 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1093 { 1094 Mat_SeqAIJKokkos *aijkok; 1095 1096 PetscFunctionBegin; 1097 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1098 if (aijkok) { /* Only zero the device if data is already there */ 1099 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1100 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1101 } else { /* Might be preallocated but not assembled */ 1102 PetscCall(MatZeroEntries_SeqAIJ(A)); 1103 } 1104 PetscFunctionReturn(PETSC_SUCCESS); 1105 } 1106 1107 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1108 { 1109 Mat_SeqAIJKokkos *aijkok; 1110 PetscInt n; 1111 PetscScalarKokkosView xv; 1112 1113 PetscFunctionBegin; 1114 PetscCall(VecGetLocalSize(x, &n)); 1115 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1116 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1117 1118 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1119 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1120 1121 const auto &Aa = aijkok->a_dual.view_device(); 1122 const auto &Ai = aijkok->i_dual.view_device(); 1123 const auto &Adiag = aijkok->diag_dual.view_device(); 1124 1125 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1126 Kokkos::parallel_for( 1127 n, KOKKOS_LAMBDA(const PetscInt i) { 1128 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1129 else xv(i) = 0; 1130 }); 1131 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1136 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1137 { 1138 Mat_SeqAIJKokkos *aijkok; 1139 1140 PetscFunctionBegin; 1141 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1142 PetscAssertPointer(kv, 2); 1143 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1144 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1145 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1146 *kv = aijkok->a_dual.view_device(); 1147 PetscFunctionReturn(PETSC_SUCCESS); 1148 } 1149 1150 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1151 { 1152 PetscFunctionBegin; 1153 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1154 PetscAssertPointer(kv, 2); 1155 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1156 PetscFunctionReturn(PETSC_SUCCESS); 1157 } 1158 1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1160 { 1161 Mat_SeqAIJKokkos *aijkok; 1162 1163 PetscFunctionBegin; 1164 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1165 PetscAssertPointer(kv, 2); 1166 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1167 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1168 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1169 *kv = aijkok->a_dual.view_device(); 1170 PetscFunctionReturn(PETSC_SUCCESS); 1171 } 1172 1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1174 { 1175 PetscFunctionBegin; 1176 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1177 PetscAssertPointer(kv, 2); 1178 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1179 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1180 PetscFunctionReturn(PETSC_SUCCESS); 1181 } 1182 1183 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1184 { 1185 Mat_SeqAIJKokkos *aijkok; 1186 1187 PetscFunctionBegin; 1188 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1189 PetscAssertPointer(kv, 2); 1190 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1191 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1192 *kv = aijkok->a_dual.view_device(); 1193 PetscFunctionReturn(PETSC_SUCCESS); 1194 } 1195 1196 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1197 { 1198 PetscFunctionBegin; 1199 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1200 PetscAssertPointer(kv, 2); 1201 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1203 PetscFunctionReturn(PETSC_SUCCESS); 1204 } 1205 1206 /* Computes Y += alpha X */ 1207 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1208 { 1209 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1210 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1211 ConstMatScalarKokkosView Xa; 1212 MatScalarKokkosView Ya; 1213 1214 PetscFunctionBegin; 1215 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1216 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1217 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1218 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1219 PetscCall(PetscLogGpuTimeBegin()); 1220 1221 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1222 PetscBool e; 1223 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1224 if (e) { 1225 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1226 if (e) pattern = SAME_NONZERO_PATTERN; 1227 } 1228 } 1229 1230 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1231 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1232 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1233 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1234 */ 1235 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1236 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1237 Xa = xkok->a_dual.view_device(); 1238 Ya = ykok->a_dual.view_device(); 1239 1240 if (pattern == SAME_NONZERO_PATTERN) { 1241 KokkosBlas::axpy(alpha, Xa, Ya); 1242 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1243 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1244 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1245 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1246 1247 Kokkos::parallel_for( 1248 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1249 PetscInt i = t.league_rank(); // row i 1250 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1251 // Only one thread works in a team 1252 PetscInt p, q = Yi(i); 1253 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1254 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1255 if (Xj(p) == Yj(q)) { // Found it 1256 Ya(q) += alpha * Xa(p); 1257 q++; 1258 } else { 1259 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1260 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1261 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1262 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1263 #else 1264 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1265 #endif 1266 } 1267 } 1268 }); 1269 }); 1270 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1271 } else { // different nonzero patterns 1272 Mat Z; 1273 KokkosCsrMatrix zcsr; 1274 KernelHandle kh; 1275 kh.create_spadd_handle(true); // X, Y are sorted 1276 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1277 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1278 zkok = new Mat_SeqAIJKokkos(zcsr); 1279 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1280 PetscCall(MatHeaderReplace(Y, &Z)); 1281 kh.destroy_spadd_handle(); 1282 } 1283 PetscCall(PetscLogGpuTimeEnd()); 1284 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1285 PetscFunctionReturn(PETSC_SUCCESS); 1286 } 1287 1288 struct MatCOOStruct_SeqAIJKokkos { 1289 PetscCount n; 1290 PetscCount Atot; 1291 PetscInt nz; 1292 PetscCountKokkosView jmap; 1293 PetscCountKokkosView perm; 1294 1295 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1296 { 1297 nz = coo_h->nz; 1298 n = coo_h->n; 1299 Atot = coo_h->Atot; 1300 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1301 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1302 } 1303 }; 1304 1305 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1306 { 1307 PetscFunctionBegin; 1308 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1309 PetscFunctionReturn(PETSC_SUCCESS); 1310 } 1311 1312 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1313 { 1314 Mat_SeqAIJKokkos *akok; 1315 Mat_SeqAIJ *aseq; 1316 PetscContainer container_h, container_d; 1317 MatCOOStruct_SeqAIJ *coo_h; 1318 MatCOOStruct_SeqAIJKokkos *coo_d; 1319 1320 PetscFunctionBegin; 1321 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1322 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1323 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1324 delete akok; 1325 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1326 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1327 1328 // Copy the COO struct to device 1329 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1330 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1331 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1332 1333 // Put the COO struct in a container and then attach that to the matrix 1334 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1335 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1336 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1337 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1338 PetscCall(PetscContainerDestroy(&container_d)); 1339 PetscFunctionReturn(PETSC_SUCCESS); 1340 } 1341 1342 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1343 { 1344 MatScalarKokkosView Aa; 1345 ConstMatScalarKokkosView kv; 1346 PetscMemType memtype; 1347 PetscContainer container; 1348 MatCOOStruct_SeqAIJKokkos *coo; 1349 1350 PetscFunctionBegin; 1351 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1352 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1353 1354 const auto &n = coo->n; 1355 const auto &Annz = coo->nz; 1356 const auto &jmap = coo->jmap; 1357 const auto &perm = coo->perm; 1358 1359 PetscCall(PetscGetMemType(v, &memtype)); 1360 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1361 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1362 } else { 1363 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1364 } 1365 1366 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1367 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1368 1369 PetscCall(PetscLogGpuTimeBegin()); 1370 Kokkos::parallel_for( 1371 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1372 PetscScalar sum = 0.0; 1373 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1374 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1375 }); 1376 PetscCall(PetscLogGpuTimeEnd()); 1377 1378 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1379 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1384 { 1385 PetscFunctionBegin; 1386 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1387 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1388 B->offloadmask = PETSC_OFFLOAD_CPU; 1389 PetscFunctionReturn(PETSC_SUCCESS); 1390 } 1391 1392 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1393 { 1394 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1395 1396 PetscFunctionBegin; 1397 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1398 A->boundtocpu = PETSC_FALSE; 1399 1400 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1401 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1402 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1403 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1404 A->ops->scale = MatScale_SeqAIJKokkos; 1405 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1406 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1407 A->ops->mult = MatMult_SeqAIJKokkos; 1408 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1409 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1410 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1411 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1412 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1413 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1414 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1415 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1416 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1417 A->ops->shift = MatShift_SeqAIJKokkos; 1418 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1419 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1420 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1421 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1422 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1423 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1424 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1425 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1426 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1427 1428 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1429 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1430 PetscFunctionReturn(PETSC_SUCCESS); 1431 } 1432 1433 /* 1434 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1435 1436 Input Parameters: 1437 + A - the MATSEQAIJKOKKOS matrix 1438 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1439 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1440 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1441 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1442 1443 Output Parameter: 1444 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1445 */ 1446 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1447 { 1448 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1449 PetscInt nblocks = bs.extent(0) - 1; 1450 1451 PetscFunctionBegin; 1452 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1453 1454 // Pull out the diagonal blocks of the matrix and then invert the blocks 1455 auto Aa = akok->a_dual.view_device(); 1456 auto Ai = akok->i_dual.view_device(); 1457 auto Aj = akok->j_dual.view_device(); 1458 auto Adiag = akok->diag_dual.view_device(); 1459 // TODO: how to tune the team size? 1460 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1461 auto ts = Kokkos::AUTO(); 1462 #else 1463 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1464 #endif 1465 PetscCallCXX(Kokkos::parallel_for( 1466 Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1467 const PetscInt bid = teamMember.league_rank(); // block id 1468 const PetscInt rstart = bs(bid); // this block starts from this row 1469 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1470 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1471 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1472 1473 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1474 PetscInt i = rstart + r; // i-th row in A 1475 1476 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1477 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1478 1479 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1480 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1481 B(r, c) = 0.0; 1482 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1483 B(r, c) = Aa(first + c); 1484 } else { // this entry does not show up in the CSR 1485 B(r, c) = 0.0; 1486 } 1487 } 1488 } else { // rare case that the diagonal does not exist 1489 const PetscInt begin = Ai(i); 1490 const PetscInt end = Ai(i + 1); 1491 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1492 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1493 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1494 else if (Aj(j) >= rstart + m) break; 1495 } 1496 } 1497 }); 1498 1499 // LU-decompose B (w/o pivoting) and then invert B 1500 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1501 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1502 })); 1503 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1504 PetscFunctionReturn(PETSC_SUCCESS); 1505 } 1506 1507 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1508 { 1509 Mat_SeqAIJ *aseq; 1510 PetscInt i, m, n; 1511 auto &exec = PetscGetKokkosExecutionSpace(); 1512 1513 PetscFunctionBegin; 1514 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1515 1516 m = akok->nrows(); 1517 n = akok->ncols(); 1518 PetscCall(MatSetSizes(A, m, n, m, n)); 1519 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1520 1521 /* Set up data structures of A as a MATSEQAIJ */ 1522 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1523 aseq = (Mat_SeqAIJ *)(A)->data; 1524 1525 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1526 PetscCallCXX(akok->j_dual.sync_host(exec)); 1527 PetscCallCXX(exec.fence()); 1528 1529 aseq->i = akok->i_host_data(); 1530 aseq->j = akok->j_host_data(); 1531 aseq->a = akok->a_host_data(); 1532 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1533 aseq->singlemalloc = PETSC_FALSE; 1534 aseq->free_a = PETSC_FALSE; 1535 aseq->free_ij = PETSC_FALSE; 1536 aseq->nz = akok->nnz(); 1537 aseq->maxnz = aseq->nz; 1538 1539 PetscCall(PetscMalloc1(m, &aseq->imax)); 1540 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1541 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1542 1543 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1544 akok->nonzerostate = A->nonzerostate; 1545 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1546 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1547 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1548 PetscFunctionReturn(PETSC_SUCCESS); 1549 } 1550 1551 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1552 { 1553 PetscFunctionBegin; 1554 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1555 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1556 PetscFunctionReturn(PETSC_SUCCESS); 1557 } 1558 1559 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1560 { 1561 Mat_SeqAIJKokkos *akok; 1562 1563 PetscFunctionBegin; 1564 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1565 PetscCall(MatCreate(comm, A)); 1566 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1567 PetscFunctionReturn(PETSC_SUCCESS); 1568 } 1569 1570 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1571 1572 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1573 */ 1574 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1575 { 1576 PetscFunctionBegin; 1577 PetscCall(MatCreate(comm, A)); 1578 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1579 PetscFunctionReturn(PETSC_SUCCESS); 1580 } 1581 1582 /*@C 1583 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1584 (the default parallel PETSc format). This matrix will ultimately be handled by 1585 Kokkos for calculations. 1586 1587 Collective 1588 1589 Input Parameters: 1590 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1591 . m - number of rows 1592 . n - number of columns 1593 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1594 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1595 1596 Output Parameter: 1597 . A - the matrix 1598 1599 Level: intermediate 1600 1601 Notes: 1602 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1603 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1604 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1605 1606 The AIJ format, also called 1607 compressed row storage, is fully compatible with standard Fortran 1608 storage. That is, the stored row and column indices can begin at 1609 either one (as in Fortran) or zero. 1610 1611 Specify the preallocated storage with either `nz` or `nnz` (not both). 1612 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1613 allocation. 1614 1615 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1616 @*/ 1617 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1618 { 1619 PetscFunctionBegin; 1620 PetscCall(PetscKokkosInitializeCheck()); 1621 PetscCall(MatCreate(comm, A)); 1622 PetscCall(MatSetSizes(*A, m, n, m, n)); 1623 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1624 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1625 PetscFunctionReturn(PETSC_SUCCESS); 1626 } 1627 1628 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1629 { 1630 PetscFunctionBegin; 1631 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1632 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1633 PetscFunctionReturn(PETSC_SUCCESS); 1634 } 1635 1636 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1637 { 1638 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1639 1640 PetscFunctionBegin; 1641 if (!factors->sptrsv_symbolic_completed) { 1642 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1643 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1644 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1645 } 1646 PetscFunctionReturn(PETSC_SUCCESS); 1647 } 1648 1649 /* Check if we need to update factors etc for transpose solve */ 1650 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1651 { 1652 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1653 MatColIdxType n = A->rmap->n; 1654 1655 PetscFunctionBegin; 1656 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1657 /* Update L^T and do sptrsv symbolic */ 1658 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1659 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1660 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1661 1662 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1663 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1664 1665 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1666 We have to sort the indices, until KK provides finer control options. 1667 */ 1668 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1669 1670 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1671 1672 /* Update U^T and do sptrsv symbolic */ 1673 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1674 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1675 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1676 1677 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1678 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1679 1680 /* Sort indices. See comments above */ 1681 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1682 1683 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1684 factors->transpose_updated = PETSC_TRUE; 1685 } 1686 PetscFunctionReturn(PETSC_SUCCESS); 1687 } 1688 1689 /* Solve Ax = b, with A = LU */ 1690 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1691 { 1692 ConstPetscScalarKokkosView bv; 1693 PetscScalarKokkosView xv; 1694 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1695 1696 PetscFunctionBegin; 1697 PetscCall(PetscLogGpuTimeBegin()); 1698 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1699 PetscCall(VecGetKokkosView(b, &bv)); 1700 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1701 /* Solve L tmpv = b */ 1702 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1703 /* Solve Ux = tmpv */ 1704 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1705 PetscCall(VecRestoreKokkosView(b, &bv)); 1706 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1707 PetscCall(PetscLogGpuTimeEnd()); 1708 PetscFunctionReturn(PETSC_SUCCESS); 1709 } 1710 1711 /* Solve A^T x = b, where A^T = U^T L^T */ 1712 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1713 { 1714 ConstPetscScalarKokkosView bv; 1715 PetscScalarKokkosView xv; 1716 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1717 1718 PetscFunctionBegin; 1719 PetscCall(PetscLogGpuTimeBegin()); 1720 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1721 PetscCall(VecGetKokkosView(b, &bv)); 1722 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1723 /* Solve U^T tmpv = b */ 1724 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1725 1726 /* Solve L^T x = tmpv */ 1727 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1728 PetscCall(VecRestoreKokkosView(b, &bv)); 1729 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1730 PetscCall(PetscLogGpuTimeEnd()); 1731 PetscFunctionReturn(PETSC_SUCCESS); 1732 } 1733 1734 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1735 { 1736 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1737 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1738 PetscInt fill_lev = info->levels; 1739 1740 PetscFunctionBegin; 1741 PetscCall(PetscLogGpuTimeBegin()); 1742 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1743 1744 auto a_d = aijkok->a_dual.view_device(); 1745 auto i_d = aijkok->i_dual.view_device(); 1746 auto j_d = aijkok->j_dual.view_device(); 1747 1748 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1749 1750 B->assembled = PETSC_TRUE; 1751 B->preallocated = PETSC_TRUE; 1752 B->ops->solve = MatSolve_SeqAIJKokkos; 1753 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1754 B->ops->matsolve = NULL; 1755 B->ops->matsolvetranspose = NULL; 1756 B->offloadmask = PETSC_OFFLOAD_GPU; 1757 1758 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1759 factors->transpose_updated = PETSC_FALSE; 1760 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1761 /* TODO: log flops, but how to know that? */ 1762 PetscCall(PetscLogGpuTimeEnd()); 1763 PetscFunctionReturn(PETSC_SUCCESS); 1764 } 1765 1766 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1767 { 1768 Mat_SeqAIJKokkos *aijkok; 1769 Mat_SeqAIJ *b; 1770 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1771 PetscInt fill_lev = info->levels; 1772 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1773 PetscInt n = A->rmap->n; 1774 1775 PetscFunctionBegin; 1776 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1777 /* Rebuild factors */ 1778 if (factors) { 1779 factors->Destroy(); 1780 } /* Destroy the old if it exists */ 1781 else { 1782 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1783 } 1784 1785 /* Create a spiluk handle and then do symbolic factorization */ 1786 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1787 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1788 1789 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1790 1791 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1792 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1793 Kokkos::realloc(factors->iU_d, n + 1); 1794 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1795 1796 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1797 auto i_d = aijkok->i_dual.view_device(); 1798 auto j_d = aijkok->j_dual.view_device(); 1799 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1800 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1801 1802 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1803 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1804 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1805 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1806 1807 /* TODO: add options to select sptrsv algorithms */ 1808 /* Create sptrsv handles for L, U and their transpose */ 1809 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1810 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1811 #else 1812 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1813 #endif 1814 1815 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1816 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1817 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1818 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1819 1820 /* Fill fields of the factor matrix B */ 1821 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1822 b = (Mat_SeqAIJ *)B->data; 1823 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1824 B->info.fill_ratio_given = info->fill; 1825 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1826 1827 B->offloadmask = PETSC_OFFLOAD_GPU; 1828 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1829 PetscFunctionReturn(PETSC_SUCCESS); 1830 } 1831 1832 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1833 { 1834 PetscFunctionBegin; 1835 *type = MATSOLVERKOKKOS; 1836 PetscFunctionReturn(PETSC_SUCCESS); 1837 } 1838 1839 /*MC 1840 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1841 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1842 1843 Level: beginner 1844 1845 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1846 M*/ 1847 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1848 { 1849 PetscInt n = A->rmap->n; 1850 1851 PetscFunctionBegin; 1852 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1853 PetscCall(MatSetSizes(*B, n, n, n, n)); 1854 (*B)->factortype = ftype; 1855 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1856 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1857 1858 if (ftype == MAT_FACTOR_LU) { 1859 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1860 (*B)->canuseordering = PETSC_TRUE; 1861 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1862 } else if (ftype == MAT_FACTOR_ILU) { 1863 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1864 (*B)->canuseordering = PETSC_FALSE; 1865 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1866 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1867 1868 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1869 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1870 PetscFunctionReturn(PETSC_SUCCESS); 1871 } 1872 1873 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1874 { 1875 PetscFunctionBegin; 1876 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1877 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1878 PetscFunctionReturn(PETSC_SUCCESS); 1879 } 1880 1881 /* Utility to print out a KokkosCsrMatrix for debugging */ 1882 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1883 { 1884 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1885 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1886 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1887 const PetscInt *i = iv.data(); 1888 const PetscInt *j = jv.data(); 1889 const PetscScalar *a = av.data(); 1890 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1891 1892 PetscFunctionBegin; 1893 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1894 for (PetscInt k = 0; k < m; k++) { 1895 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1896 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1897 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1898 } 1899 PetscFunctionReturn(PETSC_SUCCESS); 1900 } 1901