1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289 } 290 } else { // Generate T of size n x m for the first time 291 MatRowMapKokkosView perm; 292 293 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 294 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 296 } 297 akok->transpose_updated = PETSC_TRUE; 298 *csrmatT = akok->csrmatT; 299 } 300 PetscFunctionReturn(PETSC_SUCCESS); 301 } 302 303 // Generate the Hermitian on device and cache it internally 304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 305 { 306 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 307 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 308 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 309 KokkosCsrMatrix &T = akok->csrmatH; 310 311 PetscFunctionBegin; 312 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 313 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 314 315 const auto &Aa = akok->a_dual.view_device(); 316 317 if (A->hermitian == PETSC_BOOL3_TRUE) { 318 *csrmatH = akok->csrmat; 319 } else { 320 // See if we already have a cached hermitian and its value is up to date 321 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 322 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 323 const auto &perm = akok->transpose_perm; // get the permutation array 324 auto &Ta = T.values; 325 326 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 334 } 335 akok->hermitian_updated = PETSC_TRUE; 336 *csrmatH = akok->csrmatH; 337 } 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 /* y = A x */ 342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv; 346 PetscScalarKokkosView yv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx, &xv)); 352 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 353 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 355 PetscCall(VecRestoreKokkosView(xx, &xv)); 356 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 358 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 359 PetscCall(PetscLogGpuTimeEnd()); 360 PetscFunctionReturn(PETSC_SUCCESS); 361 } 362 363 /* y = A^T x */ 364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365 { 366 Mat_SeqAIJKokkos *aijkok; 367 const char *mode; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 KokkosCsrMatrix csrmat; 371 372 PetscFunctionBegin; 373 PetscCall(PetscLogGpuTimeBegin()); 374 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 375 PetscCall(VecGetKokkosView(xx, &xv)); 376 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377 if (A->form_explicit_transpose) { 378 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379 mode = "N"; 380 } else { 381 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 382 csrmat = aijkok->csrmat; 383 mode = "T"; 384 } 385 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 386 PetscCall(VecRestoreKokkosView(xx, &xv)); 387 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 388 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 /* y = A^H x */ 394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv; 399 PetscScalarKokkosView yv; 400 KokkosCsrMatrix csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx, &xv)); 406 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407 if (A->form_explicit_transpose) { 408 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409 mode = "N"; 410 } else { 411 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 412 csrmat = aijkok->csrmat; 413 mode = "C"; 414 } 415 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 416 PetscCall(VecRestoreKokkosView(xx, &xv)); 417 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 418 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 419 PetscCall(PetscLogGpuTimeEnd()); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 /* z = A x + y */ 424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425 { 426 Mat_SeqAIJKokkos *aijkok; 427 ConstPetscScalarKokkosView xv, yv; 428 PetscScalarKokkosView zv; 429 430 PetscFunctionBegin; 431 PetscCall(PetscLogGpuTimeBegin()); 432 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 433 PetscCall(VecGetKokkosView(xx, &xv)); 434 PetscCall(VecGetKokkosView(yy, &yv)); 435 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 436 if (zz != yy) Kokkos::deep_copy(zv, yv); 437 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 438 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 439 PetscCall(VecRestoreKokkosView(xx, &xv)); 440 PetscCall(VecRestoreKokkosView(yy, &yv)); 441 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 442 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 443 PetscCall(PetscLogGpuTimeEnd()); 444 PetscFunctionReturn(PETSC_SUCCESS); 445 } 446 447 /* z = A^T x + y */ 448 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 449 { 450 Mat_SeqAIJKokkos *aijkok; 451 const char *mode; 452 ConstPetscScalarKokkosView xv, yv; 453 PetscScalarKokkosView zv; 454 KokkosCsrMatrix csrmat; 455 456 PetscFunctionBegin; 457 PetscCall(PetscLogGpuTimeBegin()); 458 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 459 PetscCall(VecGetKokkosView(xx, &xv)); 460 PetscCall(VecGetKokkosView(yy, &yv)); 461 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 462 if (zz != yy) Kokkos::deep_copy(zv, yv); 463 if (A->form_explicit_transpose) { 464 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 465 mode = "N"; 466 } else { 467 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 468 csrmat = aijkok->csrmat; 469 mode = "T"; 470 } 471 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 472 PetscCall(VecRestoreKokkosView(xx, &xv)); 473 PetscCall(VecRestoreKokkosView(yy, &yv)); 474 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 475 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 476 PetscCall(PetscLogGpuTimeEnd()); 477 PetscFunctionReturn(PETSC_SUCCESS); 478 } 479 480 /* z = A^H x + y */ 481 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 482 { 483 Mat_SeqAIJKokkos *aijkok; 484 const char *mode; 485 ConstPetscScalarKokkosView xv, yv; 486 PetscScalarKokkosView zv; 487 KokkosCsrMatrix csrmat; 488 489 PetscFunctionBegin; 490 PetscCall(PetscLogGpuTimeBegin()); 491 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 492 PetscCall(VecGetKokkosView(xx, &xv)); 493 PetscCall(VecGetKokkosView(yy, &yv)); 494 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 495 if (zz != yy) Kokkos::deep_copy(zv, yv); 496 if (A->form_explicit_transpose) { 497 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 498 mode = "N"; 499 } else { 500 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 501 csrmat = aijkok->csrmat; 502 mode = "C"; 503 } 504 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 505 PetscCall(VecRestoreKokkosView(xx, &xv)); 506 PetscCall(VecRestoreKokkosView(yy, &yv)); 507 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 508 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 509 PetscCall(PetscLogGpuTimeEnd()); 510 PetscFunctionReturn(PETSC_SUCCESS); 511 } 512 513 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 514 { 515 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 516 517 PetscFunctionBegin; 518 switch (op) { 519 case MAT_FORM_EXPLICIT_TRANSPOSE: 520 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 521 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 522 A->form_explicit_transpose = flg; 523 break; 524 default: 525 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 526 break; 527 } 528 PetscFunctionReturn(PETSC_SUCCESS); 529 } 530 531 /* Depending on reuse, either build a new mat, or use the existing mat */ 532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 533 { 534 Mat_SeqAIJ *aseq; 535 536 PetscFunctionBegin; 537 PetscCall(PetscKokkosInitializeCheck()); 538 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 539 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 540 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 541 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 542 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 543 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 544 PetscCall(PetscFree(A->defaultvectype)); 545 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 546 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 547 PetscCall(MatSetOps_SeqAIJKokkos(A)); 548 aseq = static_cast<Mat_SeqAIJ *>(A->data); 549 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 550 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 551 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 552 } 553 } 554 PetscFunctionReturn(PETSC_SUCCESS); 555 } 556 557 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 558 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 559 */ 560 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 561 { 562 Mat_SeqAIJ *bseq; 563 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 564 Mat mat; 565 566 PetscFunctionBegin; 567 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 568 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 569 mat = *B; 570 if (A->assembled) { 571 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 572 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 573 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 574 /* Now copy values to B if needed */ 575 if (dupOption == MAT_COPY_VALUES) { 576 if (akok->a_dual.need_sync_device()) { 577 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 578 bkok->a_dual.modify_host(); 579 } else { /* If device has the latest data, we only copy data on device */ 580 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 581 bkok->a_dual.modify_device(); 582 } 583 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 584 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 585 bkok->a_dual.modify_host(); 586 } 587 mat->spptr = bkok; 588 } 589 590 PetscCall(PetscFree(mat->defaultvectype)); 591 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 592 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 593 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 594 PetscFunctionReturn(PETSC_SUCCESS); 595 } 596 597 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 598 { 599 Mat At; 600 KokkosCsrMatrix internT; 601 Mat_SeqAIJKokkos *atkok, *bkok; 602 603 PetscFunctionBegin; 604 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 605 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 606 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 607 /* Deep copy internT, as we want to isolate the internal transpose */ 608 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 609 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 610 if (reuse == MAT_INITIAL_MATRIX) *B = At; 611 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 612 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 613 if ((*B)->assembled) { 614 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 615 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 616 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 617 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 618 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 619 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 620 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 621 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 622 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 623 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 624 } 625 PetscFunctionReturn(PETSC_SUCCESS); 626 } 627 628 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 629 { 630 Mat_SeqAIJKokkos *aijkok; 631 632 PetscFunctionBegin; 633 if (A->factortype == MAT_FACTOR_NONE) { 634 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 635 delete aijkok; 636 } else { 637 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 638 } 639 A->spptr = NULL; 640 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 641 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 642 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 643 PetscCall(MatDestroy_SeqAIJ(A)); 644 PetscFunctionReturn(PETSC_SUCCESS); 645 } 646 647 /*MC 648 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 649 650 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 651 652 Options Database Key: 653 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 654 655 Level: beginner 656 657 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 658 M*/ 659 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 660 { 661 PetscFunctionBegin; 662 PetscCall(PetscKokkosInitializeCheck()); 663 PetscCall(MatCreate_SeqAIJ(A)); 664 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 665 PetscFunctionReturn(PETSC_SUCCESS); 666 } 667 668 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 669 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 670 { 671 Mat_SeqAIJ *a, *b; 672 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 673 MatScalarKokkosView aa, ba, ca; 674 MatRowMapKokkosView ai, bi, ci; 675 MatColIdxKokkosView aj, bj, cj; 676 PetscInt m, n, nnz, aN; 677 678 PetscFunctionBegin; 679 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 680 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 681 PetscAssertPointer(C, 4); 682 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 683 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 684 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 685 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 686 687 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 688 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 689 a = static_cast<Mat_SeqAIJ *>(A->data); 690 b = static_cast<Mat_SeqAIJ *>(B->data); 691 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 692 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 693 aa = akok->a_dual.view_device(); 694 ai = akok->i_dual.view_device(); 695 ba = bkok->a_dual.view_device(); 696 bi = bkok->i_dual.view_device(); 697 m = A->rmap->n; /* M, N and nnz of C */ 698 n = A->cmap->n + B->cmap->n; 699 nnz = a->nz + b->nz; 700 aN = A->cmap->n; /* N of A */ 701 if (reuse == MAT_INITIAL_MATRIX) { 702 aj = akok->j_dual.view_device(); 703 bj = bkok->j_dual.view_device(); 704 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 705 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 706 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 707 ca = ca_dual.view_device(); 708 ci = ci_dual.view_device(); 709 cj = cj_dual.view_device(); 710 711 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 712 Kokkos::parallel_for( 713 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 714 PetscInt i = t.league_rank(); /* row i */ 715 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 716 717 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 718 ci(i) = coffset; 719 if (i == m - 1) ci(m) = ai(m) + bi(m); 720 }); 721 722 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 723 if (k < alen) { 724 ca(coffset + k) = aa(ai(i) + k); 725 cj(coffset + k) = aj(ai(i) + k); 726 } else { 727 ca(coffset + k) = ba(bi(i) + k - alen); 728 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 729 } 730 }); 731 }); 732 ca_dual.modify_device(); 733 ci_dual.modify_device(); 734 cj_dual.modify_device(); 735 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 736 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 737 } else if (reuse == MAT_REUSE_MATRIX) { 738 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 739 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 740 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 741 ca = ckok->a_dual.view_device(); 742 ci = ckok->i_dual.view_device(); 743 744 Kokkos::parallel_for( 745 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 746 PetscInt i = t.league_rank(); /* row i */ 747 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 748 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 749 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 750 else ca(ci(i) + k) = ba(bi(i) + k - alen); 751 }); 752 }); 753 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 754 } 755 PetscFunctionReturn(PETSC_SUCCESS); 756 } 757 758 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 759 { 760 PetscFunctionBegin; 761 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 766 { 767 Mat_Product *product = C->product; 768 Mat A, B; 769 bool transA, transB; /* use bool, since KK needs this type */ 770 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 771 Mat_SeqAIJ *c; 772 MatProductData_SeqAIJKokkos *pdata; 773 KokkosCsrMatrix csrmatA, csrmatB; 774 775 PetscFunctionBegin; 776 MatCheckProduct(C, 1); 777 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 778 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 779 780 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 781 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 782 // we still do numeric. 783 if (pdata->reusesym) { // numeric reuses results from symbolic 784 pdata->reusesym = PETSC_FALSE; 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 switch (product->type) { 789 case MATPRODUCT_AB: 790 transA = false; 791 transB = false; 792 break; 793 case MATPRODUCT_AtB: 794 transA = true; 795 transB = false; 796 break; 797 case MATPRODUCT_ABt: 798 transA = false; 799 transB = true; 800 break; 801 default: 802 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 803 } 804 805 A = product->A; 806 B = product->B; 807 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 808 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 809 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 810 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 811 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 812 813 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 814 815 csrmatA = akok->csrmat; 816 csrmatB = bkok->csrmat; 817 818 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 819 if (transA) { 820 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 821 transA = false; 822 } 823 824 if (transB) { 825 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 826 transB = false; 827 } 828 PetscCall(PetscLogGpuTimeBegin()); 829 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 830 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 831 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 832 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 833 #endif 834 835 PetscCall(PetscLogGpuTimeEnd()); 836 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 837 /* shorter version of MatAssemblyEnd_SeqAIJ */ 838 c = (Mat_SeqAIJ *)C->data; 839 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 840 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 841 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 842 c->reallocs = 0; 843 C->info.mallocs = 0; 844 C->info.nz_unneeded = 0; 845 C->assembled = C->was_assembled = PETSC_TRUE; 846 C->num_ass++; 847 PetscFunctionReturn(PETSC_SUCCESS); 848 } 849 850 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 851 { 852 Mat_Product *product = C->product; 853 MatProductType ptype; 854 Mat A, B; 855 bool transA, transB; 856 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 857 MatProductData_SeqAIJKokkos *pdata; 858 MPI_Comm comm; 859 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 860 861 PetscFunctionBegin; 862 MatCheckProduct(C, 1); 863 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 864 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 865 A = product->A; 866 B = product->B; 867 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 868 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 869 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 870 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 871 csrmatA = akok->csrmat; 872 csrmatB = bkok->csrmat; 873 874 ptype = product->type; 875 // Take advantage of the symmetry if true 876 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 877 ptype = MATPRODUCT_AB; 878 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 879 } 880 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 881 ptype = MATPRODUCT_AB; 882 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 883 } 884 885 switch (ptype) { 886 case MATPRODUCT_AB: 887 transA = false; 888 transB = false; 889 break; 890 case MATPRODUCT_AtB: 891 transA = true; 892 transB = false; 893 break; 894 case MATPRODUCT_ABt: 895 transA = false; 896 transB = true; 897 break; 898 default: 899 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 900 } 901 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 902 pdata->reusesym = product->api_user; 903 904 /* TODO: add command line options to select spgemm algorithms */ 905 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 906 907 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 908 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 909 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 910 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 911 #endif 912 #endif 913 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 914 915 PetscCall(PetscLogGpuTimeBegin()); 916 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 917 if (transA) { 918 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 919 transA = false; 920 } 921 922 if (transB) { 923 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 924 transB = false; 925 } 926 927 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 929 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 930 calling new Mat_SeqAIJKokkos(). 931 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 932 */ 933 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 934 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 935 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 936 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 937 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 938 #endif 939 PetscCall(PetscLogGpuTimeEnd()); 940 941 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 942 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 943 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 944 PetscFunctionReturn(PETSC_SUCCESS); 945 } 946 947 /* handles sparse matrix matrix ops */ 948 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 949 { 950 Mat_Product *product = mat->product; 951 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 952 953 PetscFunctionBegin; 954 MatCheckProduct(mat, 1); 955 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 956 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 957 if (Biskok && Ciskok) { 958 switch (product->type) { 959 case MATPRODUCT_AB: 960 case MATPRODUCT_AtB: 961 case MATPRODUCT_ABt: 962 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 963 break; 964 case MATPRODUCT_PtAP: 965 case MATPRODUCT_RARt: 966 case MATPRODUCT_ABC: 967 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 968 break; 969 default: 970 break; 971 } 972 } else { /* fallback for AIJ */ 973 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 974 } 975 PetscFunctionReturn(PETSC_SUCCESS); 976 } 977 978 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 979 { 980 Mat_SeqAIJKokkos *aijkok; 981 982 PetscFunctionBegin; 983 PetscCall(PetscLogGpuTimeBegin()); 984 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 985 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 986 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 987 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 988 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 989 PetscCall(PetscLogGpuTimeEnd()); 990 PetscFunctionReturn(PETSC_SUCCESS); 991 } 992 993 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 994 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 995 { 996 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 997 998 PetscFunctionBegin; 999 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1000 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1001 1002 PetscCall(PetscLogGpuTimeBegin()); 1003 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1004 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1005 const auto &Aa = aijkok->a_dual.view_device(); 1006 const auto &Adiag = aijkok->diag_dual.view_device(); 1007 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1008 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1009 PetscCall(PetscLogGpuFlops(n)); 1010 PetscCall(PetscLogGpuTimeEnd()); 1011 } else { // need reassembly, very slow! 1012 PetscCall(MatShift_Basic(A, a)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1018 { 1019 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1020 1021 PetscFunctionBegin; 1022 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1023 ConstPetscScalarKokkosView dv; 1024 PetscInt n, nv; 1025 1026 PetscCall(PetscLogGpuTimeBegin()); 1027 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1028 PetscCall(VecGetKokkosView(D, &dv)); 1029 PetscCall(VecGetLocalSize(D, &nv)); 1030 n = PetscMin(Y->rmap->n, Y->cmap->n); 1031 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1032 1033 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1034 const auto &Aa = aijkok->a_dual.view_device(); 1035 const auto &Adiag = aijkok->diag_dual.view_device(); 1036 PetscCallCXX(Kokkos::parallel_for( 1037 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1038 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1039 else Aa(Adiag(i)) += dv(i); 1040 })); 1041 PetscCall(VecRestoreKokkosView(D, &dv)); 1042 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1043 PetscCall(PetscLogGpuFlops(n)); 1044 PetscCall(PetscLogGpuTimeEnd()); 1045 } else { // need reassembly, very slow! 1046 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1047 } 1048 PetscFunctionReturn(PETSC_SUCCESS); 1049 } 1050 1051 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1052 { 1053 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1054 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1055 ConstPetscScalarKokkosView lv, rv; 1056 1057 PetscFunctionBegin; 1058 PetscCall(PetscLogGpuTimeBegin()); 1059 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1060 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1061 const auto &Aa = aijkok->a_dual.view_device(); 1062 const auto &Ai = aijkok->i_dual.view_device(); 1063 const auto &Aj = aijkok->j_dual.view_device(); 1064 if (ll) { 1065 PetscCall(VecGetLocalSize(ll, &m)); 1066 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1067 PetscCall(VecGetKokkosView(ll, &lv)); 1068 PetscCallCXX(Kokkos::parallel_for( // for each row 1069 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1070 PetscInt i = t.league_rank(); // row i 1071 PetscInt len = Ai(i + 1) - Ai(i); 1072 // scale entries on the row 1073 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1074 })); 1075 PetscCall(VecRestoreKokkosView(ll, &lv)); 1076 PetscCall(PetscLogGpuFlops(nz)); 1077 } 1078 if (rr) { 1079 PetscCall(VecGetLocalSize(rr, &n)); 1080 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1081 PetscCall(VecGetKokkosView(rr, &rv)); 1082 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1083 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1084 PetscCall(VecRestoreKokkosView(rr, &lv)); 1085 PetscCall(PetscLogGpuFlops(nz)); 1086 } 1087 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1088 PetscCall(PetscLogGpuTimeEnd()); 1089 PetscFunctionReturn(PETSC_SUCCESS); 1090 } 1091 1092 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1093 { 1094 Mat_SeqAIJKokkos *aijkok; 1095 1096 PetscFunctionBegin; 1097 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1098 if (aijkok) { /* Only zero the device if data is already there */ 1099 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1100 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1101 } else { /* Might be preallocated but not assembled */ 1102 PetscCall(MatZeroEntries_SeqAIJ(A)); 1103 } 1104 PetscFunctionReturn(PETSC_SUCCESS); 1105 } 1106 1107 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1108 { 1109 Mat_SeqAIJKokkos *aijkok; 1110 PetscInt n; 1111 PetscScalarKokkosView xv; 1112 1113 PetscFunctionBegin; 1114 PetscCall(VecGetLocalSize(x, &n)); 1115 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1116 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1117 1118 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1119 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1120 1121 const auto &Aa = aijkok->a_dual.view_device(); 1122 const auto &Ai = aijkok->i_dual.view_device(); 1123 const auto &Adiag = aijkok->diag_dual.view_device(); 1124 1125 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1126 Kokkos::parallel_for( 1127 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1128 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1129 else xv(i) = 0; 1130 }); 1131 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1136 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1137 { 1138 Mat_SeqAIJKokkos *aijkok; 1139 1140 PetscFunctionBegin; 1141 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1142 PetscAssertPointer(kv, 2); 1143 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1144 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1145 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1146 *kv = aijkok->a_dual.view_device(); 1147 PetscFunctionReturn(PETSC_SUCCESS); 1148 } 1149 1150 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1151 { 1152 PetscFunctionBegin; 1153 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1154 PetscAssertPointer(kv, 2); 1155 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1156 PetscFunctionReturn(PETSC_SUCCESS); 1157 } 1158 1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1160 { 1161 Mat_SeqAIJKokkos *aijkok; 1162 1163 PetscFunctionBegin; 1164 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1165 PetscAssertPointer(kv, 2); 1166 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1167 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1168 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1169 *kv = aijkok->a_dual.view_device(); 1170 PetscFunctionReturn(PETSC_SUCCESS); 1171 } 1172 1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1174 { 1175 PetscFunctionBegin; 1176 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1177 PetscAssertPointer(kv, 2); 1178 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1179 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1180 PetscFunctionReturn(PETSC_SUCCESS); 1181 } 1182 1183 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1184 { 1185 Mat_SeqAIJKokkos *aijkok; 1186 1187 PetscFunctionBegin; 1188 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1189 PetscAssertPointer(kv, 2); 1190 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1191 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1192 *kv = aijkok->a_dual.view_device(); 1193 PetscFunctionReturn(PETSC_SUCCESS); 1194 } 1195 1196 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1197 { 1198 PetscFunctionBegin; 1199 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1200 PetscAssertPointer(kv, 2); 1201 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1203 PetscFunctionReturn(PETSC_SUCCESS); 1204 } 1205 1206 /* Computes Y += alpha X */ 1207 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1208 { 1209 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1210 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1211 ConstMatScalarKokkosView Xa; 1212 MatScalarKokkosView Ya; 1213 auto &exec = PetscGetKokkosExecutionSpace(); 1214 1215 PetscFunctionBegin; 1216 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1217 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1218 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1219 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1220 PetscCall(PetscLogGpuTimeBegin()); 1221 1222 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1223 PetscBool e; 1224 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1225 if (e) { 1226 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1227 if (e) pattern = SAME_NONZERO_PATTERN; 1228 } 1229 } 1230 1231 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1232 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1233 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1234 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1235 */ 1236 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1237 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1238 Xa = xkok->a_dual.view_device(); 1239 Ya = ykok->a_dual.view_device(); 1240 1241 if (pattern == SAME_NONZERO_PATTERN) { 1242 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1243 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1244 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1245 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1246 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1247 1248 Kokkos::parallel_for( 1249 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1250 PetscInt i = t.league_rank(); // row i 1251 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1252 // Only one thread works in a team 1253 PetscInt p, q = Yi(i); 1254 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1255 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1256 if (Xj(p) == Yj(q)) { // Found it 1257 Ya(q) += alpha * Xa(p); 1258 q++; 1259 } else { 1260 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1261 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1262 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1263 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1264 #else 1265 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1266 #endif 1267 } 1268 } 1269 }); 1270 }); 1271 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1272 } else { // different nonzero patterns 1273 Mat Z; 1274 KokkosCsrMatrix zcsr; 1275 KernelHandle kh; 1276 kh.create_spadd_handle(true); // X, Y are sorted 1277 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1278 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1279 zkok = new Mat_SeqAIJKokkos(zcsr); 1280 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1281 PetscCall(MatHeaderReplace(Y, &Z)); 1282 kh.destroy_spadd_handle(); 1283 } 1284 PetscCall(PetscLogGpuTimeEnd()); 1285 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1286 PetscFunctionReturn(PETSC_SUCCESS); 1287 } 1288 1289 struct MatCOOStruct_SeqAIJKokkos { 1290 PetscCount n; 1291 PetscCount Atot; 1292 PetscInt nz; 1293 PetscCountKokkosView jmap; 1294 PetscCountKokkosView perm; 1295 1296 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1297 { 1298 nz = coo_h->nz; 1299 n = coo_h->n; 1300 Atot = coo_h->Atot; 1301 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1302 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1303 } 1304 }; 1305 1306 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1307 { 1308 PetscFunctionBegin; 1309 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1310 PetscFunctionReturn(PETSC_SUCCESS); 1311 } 1312 1313 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1314 { 1315 Mat_SeqAIJKokkos *akok; 1316 Mat_SeqAIJ *aseq; 1317 PetscContainer container_h, container_d; 1318 MatCOOStruct_SeqAIJ *coo_h; 1319 MatCOOStruct_SeqAIJKokkos *coo_d; 1320 1321 PetscFunctionBegin; 1322 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1323 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1324 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1325 delete akok; 1326 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1327 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1328 1329 // Copy the COO struct to device 1330 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1331 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1332 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1333 1334 // Put the COO struct in a container and then attach that to the matrix 1335 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1336 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1337 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1338 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1339 PetscCall(PetscContainerDestroy(&container_d)); 1340 PetscFunctionReturn(PETSC_SUCCESS); 1341 } 1342 1343 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1344 { 1345 MatScalarKokkosView Aa; 1346 ConstMatScalarKokkosView kv; 1347 PetscMemType memtype; 1348 PetscContainer container; 1349 MatCOOStruct_SeqAIJKokkos *coo; 1350 1351 PetscFunctionBegin; 1352 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1353 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1354 1355 const auto &n = coo->n; 1356 const auto &Annz = coo->nz; 1357 const auto &jmap = coo->jmap; 1358 const auto &perm = coo->perm; 1359 1360 PetscCall(PetscGetMemType(v, &memtype)); 1361 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1362 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1363 } else { 1364 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1365 } 1366 1367 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1368 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1369 1370 PetscCall(PetscLogGpuTimeBegin()); 1371 Kokkos::parallel_for( 1372 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1373 PetscScalar sum = 0.0; 1374 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1375 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1376 }); 1377 PetscCall(PetscLogGpuTimeEnd()); 1378 1379 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1380 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1381 PetscFunctionReturn(PETSC_SUCCESS); 1382 } 1383 1384 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1385 { 1386 PetscFunctionBegin; 1387 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1388 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1389 B->offloadmask = PETSC_OFFLOAD_CPU; 1390 PetscFunctionReturn(PETSC_SUCCESS); 1391 } 1392 1393 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1394 { 1395 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1396 1397 PetscFunctionBegin; 1398 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1399 A->boundtocpu = PETSC_FALSE; 1400 1401 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1402 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1403 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1404 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1405 A->ops->scale = MatScale_SeqAIJKokkos; 1406 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1407 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1408 A->ops->mult = MatMult_SeqAIJKokkos; 1409 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1410 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1411 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1412 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1413 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1414 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1415 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1416 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1417 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1418 A->ops->shift = MatShift_SeqAIJKokkos; 1419 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1420 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1421 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1422 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1423 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1424 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1425 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1426 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1427 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1428 1429 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1430 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1431 PetscFunctionReturn(PETSC_SUCCESS); 1432 } 1433 1434 /* 1435 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1436 1437 Input Parameters: 1438 + A - the MATSEQAIJKOKKOS matrix 1439 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1440 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1441 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1442 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1443 1444 Output Parameter: 1445 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1446 */ 1447 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1448 { 1449 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1450 PetscInt nblocks = bs.extent(0) - 1; 1451 1452 PetscFunctionBegin; 1453 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1454 1455 // Pull out the diagonal blocks of the matrix and then invert the blocks 1456 auto Aa = akok->a_dual.view_device(); 1457 auto Ai = akok->i_dual.view_device(); 1458 auto Aj = akok->j_dual.view_device(); 1459 auto Adiag = akok->diag_dual.view_device(); 1460 // TODO: how to tune the team size? 1461 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1462 auto ts = Kokkos::AUTO(); 1463 #else 1464 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1465 #endif 1466 PetscCallCXX(Kokkos::parallel_for( 1467 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1468 const PetscInt bid = teamMember.league_rank(); // block id 1469 const PetscInt rstart = bs(bid); // this block starts from this row 1470 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1471 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1472 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1473 1474 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1475 PetscInt i = rstart + r; // i-th row in A 1476 1477 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1478 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1479 1480 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1481 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1482 B(r, c) = 0.0; 1483 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1484 B(r, c) = Aa(first + c); 1485 } else { // this entry does not show up in the CSR 1486 B(r, c) = 0.0; 1487 } 1488 } 1489 } else { // rare case that the diagonal does not exist 1490 const PetscInt begin = Ai(i); 1491 const PetscInt end = Ai(i + 1); 1492 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1493 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1494 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1495 else if (Aj(j) >= rstart + m) break; 1496 } 1497 } 1498 }); 1499 1500 // LU-decompose B (w/o pivoting) and then invert B 1501 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1502 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1503 })); 1504 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1505 PetscFunctionReturn(PETSC_SUCCESS); 1506 } 1507 1508 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1509 { 1510 Mat_SeqAIJ *aseq; 1511 PetscInt i, m, n; 1512 auto &exec = PetscGetKokkosExecutionSpace(); 1513 1514 PetscFunctionBegin; 1515 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1516 1517 m = akok->nrows(); 1518 n = akok->ncols(); 1519 PetscCall(MatSetSizes(A, m, n, m, n)); 1520 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1521 1522 /* Set up data structures of A as a MATSEQAIJ */ 1523 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1524 aseq = (Mat_SeqAIJ *)(A)->data; 1525 1526 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1527 PetscCallCXX(akok->j_dual.sync_host(exec)); 1528 PetscCallCXX(exec.fence()); 1529 1530 aseq->i = akok->i_host_data(); 1531 aseq->j = akok->j_host_data(); 1532 aseq->a = akok->a_host_data(); 1533 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1534 aseq->singlemalloc = PETSC_FALSE; 1535 aseq->free_a = PETSC_FALSE; 1536 aseq->free_ij = PETSC_FALSE; 1537 aseq->nz = akok->nnz(); 1538 aseq->maxnz = aseq->nz; 1539 1540 PetscCall(PetscMalloc1(m, &aseq->imax)); 1541 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1542 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1543 1544 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1545 akok->nonzerostate = A->nonzerostate; 1546 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1547 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1548 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1549 PetscFunctionReturn(PETSC_SUCCESS); 1550 } 1551 1552 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1553 { 1554 PetscFunctionBegin; 1555 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1556 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1557 PetscFunctionReturn(PETSC_SUCCESS); 1558 } 1559 1560 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1561 { 1562 Mat_SeqAIJKokkos *akok; 1563 1564 PetscFunctionBegin; 1565 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1566 PetscCall(MatCreate(comm, A)); 1567 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1568 PetscFunctionReturn(PETSC_SUCCESS); 1569 } 1570 1571 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1572 1573 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1574 */ 1575 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1576 { 1577 PetscFunctionBegin; 1578 PetscCall(MatCreate(comm, A)); 1579 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1580 PetscFunctionReturn(PETSC_SUCCESS); 1581 } 1582 1583 /*@C 1584 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1585 (the default parallel PETSc format). This matrix will ultimately be handled by 1586 Kokkos for calculations. 1587 1588 Collective 1589 1590 Input Parameters: 1591 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1592 . m - number of rows 1593 . n - number of columns 1594 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1595 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1596 1597 Output Parameter: 1598 . A - the matrix 1599 1600 Level: intermediate 1601 1602 Notes: 1603 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1604 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1605 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1606 1607 The AIJ format, also called 1608 compressed row storage, is fully compatible with standard Fortran 1609 storage. That is, the stored row and column indices can begin at 1610 either one (as in Fortran) or zero. 1611 1612 Specify the preallocated storage with either `nz` or `nnz` (not both). 1613 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1614 allocation. 1615 1616 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1617 @*/ 1618 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1619 { 1620 PetscFunctionBegin; 1621 PetscCall(PetscKokkosInitializeCheck()); 1622 PetscCall(MatCreate(comm, A)); 1623 PetscCall(MatSetSizes(*A, m, n, m, n)); 1624 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1625 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1626 PetscFunctionReturn(PETSC_SUCCESS); 1627 } 1628 1629 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1630 { 1631 PetscFunctionBegin; 1632 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1633 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636 1637 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1638 { 1639 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1640 1641 PetscFunctionBegin; 1642 if (!factors->sptrsv_symbolic_completed) { 1643 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1644 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1645 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1646 } 1647 PetscFunctionReturn(PETSC_SUCCESS); 1648 } 1649 1650 /* Check if we need to update factors etc for transpose solve */ 1651 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1652 { 1653 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1654 MatColIdxType n = A->rmap->n; 1655 1656 PetscFunctionBegin; 1657 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1658 /* Update L^T and do sptrsv symbolic */ 1659 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1660 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1661 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1662 1663 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1664 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1665 1666 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1667 We have to sort the indices, until KK provides finer control options. 1668 */ 1669 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1670 1671 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1672 1673 /* Update U^T and do sptrsv symbolic */ 1674 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1675 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1676 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1677 1678 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1679 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1680 1681 /* Sort indices. See comments above */ 1682 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1683 1684 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1685 factors->transpose_updated = PETSC_TRUE; 1686 } 1687 PetscFunctionReturn(PETSC_SUCCESS); 1688 } 1689 1690 /* Solve Ax = b, with A = LU */ 1691 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1692 { 1693 ConstPetscScalarKokkosView bv; 1694 PetscScalarKokkosView xv; 1695 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1696 1697 PetscFunctionBegin; 1698 PetscCall(PetscLogGpuTimeBegin()); 1699 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1700 PetscCall(VecGetKokkosView(b, &bv)); 1701 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1702 /* Solve L tmpv = b */ 1703 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1704 /* Solve Ux = tmpv */ 1705 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1706 PetscCall(VecRestoreKokkosView(b, &bv)); 1707 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1708 PetscCall(PetscLogGpuTimeEnd()); 1709 PetscFunctionReturn(PETSC_SUCCESS); 1710 } 1711 1712 /* Solve A^T x = b, where A^T = U^T L^T */ 1713 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1714 { 1715 ConstPetscScalarKokkosView bv; 1716 PetscScalarKokkosView xv; 1717 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1718 1719 PetscFunctionBegin; 1720 PetscCall(PetscLogGpuTimeBegin()); 1721 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1722 PetscCall(VecGetKokkosView(b, &bv)); 1723 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1724 /* Solve U^T tmpv = b */ 1725 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1726 1727 /* Solve L^T x = tmpv */ 1728 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1729 PetscCall(VecRestoreKokkosView(b, &bv)); 1730 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1731 PetscCall(PetscLogGpuTimeEnd()); 1732 PetscFunctionReturn(PETSC_SUCCESS); 1733 } 1734 1735 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1736 { 1737 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1738 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1739 PetscInt fill_lev = info->levels; 1740 1741 PetscFunctionBegin; 1742 PetscCall(PetscLogGpuTimeBegin()); 1743 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1744 1745 auto a_d = aijkok->a_dual.view_device(); 1746 auto i_d = aijkok->i_dual.view_device(); 1747 auto j_d = aijkok->j_dual.view_device(); 1748 1749 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1750 1751 B->assembled = PETSC_TRUE; 1752 B->preallocated = PETSC_TRUE; 1753 B->ops->solve = MatSolve_SeqAIJKokkos; 1754 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1755 B->ops->matsolve = NULL; 1756 B->ops->matsolvetranspose = NULL; 1757 B->offloadmask = PETSC_OFFLOAD_GPU; 1758 1759 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1760 factors->transpose_updated = PETSC_FALSE; 1761 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1762 /* TODO: log flops, but how to know that? */ 1763 PetscCall(PetscLogGpuTimeEnd()); 1764 PetscFunctionReturn(PETSC_SUCCESS); 1765 } 1766 1767 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1768 { 1769 Mat_SeqAIJKokkos *aijkok; 1770 Mat_SeqAIJ *b; 1771 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1772 PetscInt fill_lev = info->levels; 1773 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1774 PetscInt n = A->rmap->n; 1775 1776 PetscFunctionBegin; 1777 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1778 /* Rebuild factors */ 1779 if (factors) { 1780 factors->Destroy(); 1781 } /* Destroy the old if it exists */ 1782 else { 1783 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1784 } 1785 1786 /* Create a spiluk handle and then do symbolic factorization */ 1787 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1788 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1789 1790 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1791 1792 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1793 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1794 Kokkos::realloc(factors->iU_d, n + 1); 1795 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1796 1797 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1798 auto i_d = aijkok->i_dual.view_device(); 1799 auto j_d = aijkok->j_dual.view_device(); 1800 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1801 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1802 1803 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1804 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1805 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1806 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1807 1808 /* TODO: add options to select sptrsv algorithms */ 1809 /* Create sptrsv handles for L, U and their transpose */ 1810 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1811 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1812 #else 1813 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1814 #endif 1815 1816 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1817 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1818 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1819 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1820 1821 /* Fill fields of the factor matrix B */ 1822 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1823 b = (Mat_SeqAIJ *)B->data; 1824 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1825 B->info.fill_ratio_given = info->fill; 1826 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1827 1828 B->offloadmask = PETSC_OFFLOAD_GPU; 1829 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1830 PetscFunctionReturn(PETSC_SUCCESS); 1831 } 1832 1833 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1834 { 1835 PetscFunctionBegin; 1836 *type = MATSOLVERKOKKOS; 1837 PetscFunctionReturn(PETSC_SUCCESS); 1838 } 1839 1840 /*MC 1841 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1842 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1843 1844 Level: beginner 1845 1846 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1847 M*/ 1848 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1849 { 1850 PetscInt n = A->rmap->n; 1851 1852 PetscFunctionBegin; 1853 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1854 PetscCall(MatSetSizes(*B, n, n, n, n)); 1855 (*B)->factortype = ftype; 1856 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1857 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1858 1859 if (ftype == MAT_FACTOR_LU) { 1860 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1861 (*B)->canuseordering = PETSC_TRUE; 1862 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1863 } else if (ftype == MAT_FACTOR_ILU) { 1864 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1865 (*B)->canuseordering = PETSC_FALSE; 1866 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1867 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1868 1869 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1870 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1871 PetscFunctionReturn(PETSC_SUCCESS); 1872 } 1873 1874 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1875 { 1876 PetscFunctionBegin; 1877 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1878 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1879 PetscFunctionReturn(PETSC_SUCCESS); 1880 } 1881 1882 /* Utility to print out a KokkosCsrMatrix for debugging */ 1883 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1884 { 1885 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1886 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1887 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1888 const PetscInt *i = iv.data(); 1889 const PetscInt *j = jv.data(); 1890 const PetscScalar *a = av.data(); 1891 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1892 1893 PetscFunctionBegin; 1894 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1895 for (PetscInt k = 0; k < m; k++) { 1896 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1897 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1898 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1899 } 1900 PetscFunctionReturn(PETSC_SUCCESS); 1901 } 1902