1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289 } 290 } else { // Generate T of size n x m for the first time 291 MatRowMapKokkosView perm; 292 293 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 294 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 296 } 297 akok->transpose_updated = PETSC_TRUE; 298 *csrmatT = akok->csrmatT; 299 } 300 PetscFunctionReturn(PETSC_SUCCESS); 301 } 302 303 // Generate the Hermitian on device and cache it internally 304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 305 { 306 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 307 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 308 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 309 KokkosCsrMatrix &T = akok->csrmatH; 310 311 PetscFunctionBegin; 312 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 313 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 314 315 const auto &Aa = akok->a_dual.view_device(); 316 317 if (A->hermitian == PETSC_BOOL3_TRUE) { 318 *csrmatH = akok->csrmat; 319 } else { 320 // See if we already have a cached hermitian and its value is up to date 321 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 322 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 323 const auto &perm = akok->transpose_perm; // get the permutation array 324 auto &Ta = T.values; 325 326 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 334 } 335 akok->hermitian_updated = PETSC_TRUE; 336 *csrmatH = akok->csrmatH; 337 } 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 /* y = A x */ 342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv; 346 PetscScalarKokkosView yv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx, &xv)); 352 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 353 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 355 PetscCall(VecRestoreKokkosView(xx, &xv)); 356 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 358 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 359 PetscCall(PetscLogGpuTimeEnd()); 360 PetscFunctionReturn(PETSC_SUCCESS); 361 } 362 363 /* y = A^T x */ 364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365 { 366 Mat_SeqAIJKokkos *aijkok; 367 const char *mode; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 KokkosCsrMatrix csrmat; 371 372 PetscFunctionBegin; 373 PetscCall(PetscLogGpuTimeBegin()); 374 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 375 PetscCall(VecGetKokkosView(xx, &xv)); 376 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377 if (A->form_explicit_transpose) { 378 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379 mode = "N"; 380 } else { 381 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 382 csrmat = aijkok->csrmat; 383 mode = "T"; 384 } 385 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 386 PetscCall(VecRestoreKokkosView(xx, &xv)); 387 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 388 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 /* y = A^H x */ 394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv; 399 PetscScalarKokkosView yv; 400 KokkosCsrMatrix csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx, &xv)); 406 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407 if (A->form_explicit_transpose) { 408 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409 mode = "N"; 410 } else { 411 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 412 csrmat = aijkok->csrmat; 413 mode = "C"; 414 } 415 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 416 PetscCall(VecRestoreKokkosView(xx, &xv)); 417 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 418 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 419 PetscCall(PetscLogGpuTimeEnd()); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 /* z = A x + y */ 424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425 { 426 Mat_SeqAIJKokkos *aijkok; 427 ConstPetscScalarKokkosView xv; 428 PetscScalarKokkosView zv; 429 430 PetscFunctionBegin; 431 PetscCall(PetscLogGpuTimeBegin()); 432 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 433 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 434 PetscCall(VecGetKokkosView(xx, &xv)); 435 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 436 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 437 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 438 PetscCall(VecRestoreKokkosView(xx, &xv)); 439 PetscCall(VecRestoreKokkosView(zz, &zv)); 440 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 441 PetscCall(PetscLogGpuTimeEnd()); 442 PetscFunctionReturn(PETSC_SUCCESS); 443 } 444 445 /* z = A^T x + y */ 446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 447 { 448 Mat_SeqAIJKokkos *aijkok; 449 const char *mode; 450 ConstPetscScalarKokkosView xv; 451 PetscScalarKokkosView zv; 452 KokkosCsrMatrix csrmat; 453 454 PetscFunctionBegin; 455 PetscCall(PetscLogGpuTimeBegin()); 456 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 457 if (zz != yy) PetscCall(VecCopy(yy, zz)); 458 PetscCall(VecGetKokkosView(xx, &xv)); 459 PetscCall(VecGetKokkosView(zz, &zv)); 460 if (A->form_explicit_transpose) { 461 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 462 mode = "N"; 463 } else { 464 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 465 csrmat = aijkok->csrmat; 466 mode = "T"; 467 } 468 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 469 PetscCall(VecRestoreKokkosView(xx, &xv)); 470 PetscCall(VecRestoreKokkosView(zz, &zv)); 471 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 472 PetscCall(PetscLogGpuTimeEnd()); 473 PetscFunctionReturn(PETSC_SUCCESS); 474 } 475 476 /* z = A^H x + y */ 477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 478 { 479 Mat_SeqAIJKokkos *aijkok; 480 const char *mode; 481 ConstPetscScalarKokkosView xv; 482 PetscScalarKokkosView zv; 483 KokkosCsrMatrix csrmat; 484 485 PetscFunctionBegin; 486 PetscCall(PetscLogGpuTimeBegin()); 487 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 488 if (zz != yy) PetscCall(VecCopy(yy, zz)); 489 PetscCall(VecGetKokkosView(xx, &xv)); 490 PetscCall(VecGetKokkosView(zz, &zv)); 491 if (A->form_explicit_transpose) { 492 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 493 mode = "N"; 494 } else { 495 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 496 csrmat = aijkok->csrmat; 497 mode = "C"; 498 } 499 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 500 PetscCall(VecRestoreKokkosView(xx, &xv)); 501 PetscCall(VecRestoreKokkosView(zz, &zv)); 502 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 503 PetscCall(PetscLogGpuTimeEnd()); 504 PetscFunctionReturn(PETSC_SUCCESS); 505 } 506 507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508 { 509 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510 511 PetscFunctionBegin; 512 switch (op) { 513 case MAT_FORM_EXPLICIT_TRANSPOSE: 514 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 515 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516 A->form_explicit_transpose = flg; 517 break; 518 default: 519 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520 break; 521 } 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 /* Depending on reuse, either build a new mat, or use the existing mat */ 526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527 { 528 Mat_SeqAIJ *aseq; 529 530 PetscFunctionBegin; 531 PetscCall(PetscKokkosInitializeCheck()); 532 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 533 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 534 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 535 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 537 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 538 PetscCall(PetscFree(A->defaultvectype)); 539 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 540 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 541 PetscCall(MatSetOps_SeqAIJKokkos(A)); 542 aseq = static_cast<Mat_SeqAIJ *>(A->data); 543 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 544 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 546 } 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553 */ 554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555 { 556 Mat_SeqAIJ *bseq; 557 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558 Mat mat; 559 560 PetscFunctionBegin; 561 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 562 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563 mat = *B; 564 if (A->assembled) { 565 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 567 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568 /* Now copy values to B if needed */ 569 if (dupOption == MAT_COPY_VALUES) { 570 if (akok->a_dual.need_sync_device()) { 571 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572 bkok->a_dual.modify_host(); 573 } else { /* If device has the latest data, we only copy data on device */ 574 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575 bkok->a_dual.modify_device(); 576 } 577 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579 bkok->a_dual.modify_host(); 580 } 581 mat->spptr = bkok; 582 } 583 584 PetscCall(PetscFree(mat->defaultvectype)); 585 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 586 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 587 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592 { 593 Mat At; 594 KokkosCsrMatrix internT; 595 Mat_SeqAIJKokkos *atkok, *bkok; 596 597 PetscFunctionBegin; 598 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 599 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 600 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601 /* Deep copy internT, as we want to isolate the internal transpose */ 602 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 603 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 604 if (reuse == MAT_INITIAL_MATRIX) *B = At; 605 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 606 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 607 if ((*B)->assembled) { 608 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 609 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 610 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 611 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 612 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 613 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 614 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 615 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 616 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 617 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 618 } 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623 { 624 Mat_SeqAIJKokkos *aijkok; 625 626 PetscFunctionBegin; 627 if (A->factortype == MAT_FACTOR_NONE) { 628 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 629 delete aijkok; 630 } else { 631 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 632 } 633 A->spptr = NULL; 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 635 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 636 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 637 PetscCall(MatDestroy_SeqAIJ(A)); 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 /*MC 642 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 643 644 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 645 646 Options Database Key: 647 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 648 649 Level: beginner 650 651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 652 M*/ 653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 654 { 655 PetscFunctionBegin; 656 PetscCall(PetscKokkosInitializeCheck()); 657 PetscCall(MatCreate_SeqAIJ(A)); 658 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 659 PetscFunctionReturn(PETSC_SUCCESS); 660 } 661 662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 664 { 665 Mat_SeqAIJ *a, *b; 666 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667 MatScalarKokkosView aa, ba, ca; 668 MatRowMapKokkosView ai, bi, ci; 669 MatColIdxKokkosView aj, bj, cj; 670 PetscInt m, n, nnz, aN; 671 672 PetscFunctionBegin; 673 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 674 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 675 PetscAssertPointer(C, 4); 676 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 677 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 678 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 679 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 680 681 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 682 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 683 a = static_cast<Mat_SeqAIJ *>(A->data); 684 b = static_cast<Mat_SeqAIJ *>(B->data); 685 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 686 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 687 aa = akok->a_dual.view_device(); 688 ai = akok->i_dual.view_device(); 689 ba = bkok->a_dual.view_device(); 690 bi = bkok->i_dual.view_device(); 691 m = A->rmap->n; /* M, N and nnz of C */ 692 n = A->cmap->n + B->cmap->n; 693 nnz = a->nz + b->nz; 694 aN = A->cmap->n; /* N of A */ 695 if (reuse == MAT_INITIAL_MATRIX) { 696 aj = akok->j_dual.view_device(); 697 bj = bkok->j_dual.view_device(); 698 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 699 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 700 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 701 ca = ca_dual.view_device(); 702 ci = ci_dual.view_device(); 703 cj = cj_dual.view_device(); 704 705 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 706 Kokkos::parallel_for( 707 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 708 PetscInt i = t.league_rank(); /* row i */ 709 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 710 711 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 712 ci(i) = coffset; 713 if (i == m - 1) ci(m) = ai(m) + bi(m); 714 }); 715 716 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 717 if (k < alen) { 718 ca(coffset + k) = aa(ai(i) + k); 719 cj(coffset + k) = aj(ai(i) + k); 720 } else { 721 ca(coffset + k) = ba(bi(i) + k - alen); 722 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 723 } 724 }); 725 }); 726 ca_dual.modify_device(); 727 ci_dual.modify_device(); 728 cj_dual.modify_device(); 729 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 730 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 731 } else if (reuse == MAT_REUSE_MATRIX) { 732 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 733 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 734 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 735 ca = ckok->a_dual.view_device(); 736 ci = ckok->i_dual.view_device(); 737 738 Kokkos::parallel_for( 739 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 740 PetscInt i = t.league_rank(); /* row i */ 741 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 742 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 743 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 744 else ca(ci(i) + k) = ba(bi(i) + k - alen); 745 }); 746 }); 747 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 748 } 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 753 { 754 PetscFunctionBegin; 755 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 760 { 761 Mat_Product *product = C->product; 762 Mat A, B; 763 bool transA, transB; /* use bool, since KK needs this type */ 764 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 765 Mat_SeqAIJ *c; 766 MatProductData_SeqAIJKokkos *pdata; 767 KokkosCsrMatrix csrmatA, csrmatB; 768 769 PetscFunctionBegin; 770 MatCheckProduct(C, 1); 771 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 772 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 773 774 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 775 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 776 // we still do numeric. 777 if (pdata->reusesym) { // numeric reuses results from symbolic 778 pdata->reusesym = PETSC_FALSE; 779 PetscFunctionReturn(PETSC_SUCCESS); 780 } 781 782 switch (product->type) { 783 case MATPRODUCT_AB: 784 transA = false; 785 transB = false; 786 break; 787 case MATPRODUCT_AtB: 788 transA = true; 789 transB = false; 790 break; 791 case MATPRODUCT_ABt: 792 transA = false; 793 transB = true; 794 break; 795 default: 796 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 797 } 798 799 A = product->A; 800 B = product->B; 801 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 802 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 803 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 804 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 805 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 806 807 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 808 809 csrmatA = akok->csrmat; 810 csrmatB = bkok->csrmat; 811 812 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 813 if (transA) { 814 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 815 transA = false; 816 } 817 818 if (transB) { 819 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 820 transB = false; 821 } 822 PetscCall(PetscLogGpuTimeBegin()); 823 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 825 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 826 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 827 #endif 828 829 PetscCall(PetscLogGpuTimeEnd()); 830 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 831 /* shorter version of MatAssemblyEnd_SeqAIJ */ 832 c = (Mat_SeqAIJ *)C->data; 833 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 834 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 835 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 836 c->reallocs = 0; 837 C->info.mallocs = 0; 838 C->info.nz_unneeded = 0; 839 C->assembled = C->was_assembled = PETSC_TRUE; 840 C->num_ass++; 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 845 { 846 Mat_Product *product = C->product; 847 MatProductType ptype; 848 Mat A, B; 849 bool transA, transB; 850 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 851 MatProductData_SeqAIJKokkos *pdata; 852 MPI_Comm comm; 853 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 854 855 PetscFunctionBegin; 856 MatCheckProduct(C, 1); 857 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 858 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 859 A = product->A; 860 B = product->B; 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 863 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 864 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 865 csrmatA = akok->csrmat; 866 csrmatB = bkok->csrmat; 867 868 ptype = product->type; 869 // Take advantage of the symmetry if true 870 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 871 ptype = MATPRODUCT_AB; 872 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 873 } 874 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 875 ptype = MATPRODUCT_AB; 876 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 877 } 878 879 switch (ptype) { 880 case MATPRODUCT_AB: 881 transA = false; 882 transB = false; 883 break; 884 case MATPRODUCT_AtB: 885 transA = true; 886 transB = false; 887 break; 888 case MATPRODUCT_ABt: 889 transA = false; 890 transB = true; 891 break; 892 default: 893 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 894 } 895 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 896 pdata->reusesym = product->api_user; 897 898 /* TODO: add command line options to select spgemm algorithms */ 899 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 900 901 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 903 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 904 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 905 #endif 906 #endif 907 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 908 909 PetscCall(PetscLogGpuTimeBegin()); 910 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 911 if (transA) { 912 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 913 transA = false; 914 } 915 916 if (transB) { 917 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 918 transB = false; 919 } 920 921 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 922 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 923 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 924 calling new Mat_SeqAIJKokkos(). 925 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 926 */ 927 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 929 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 930 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 931 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 932 #endif 933 PetscCall(PetscLogGpuTimeEnd()); 934 935 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 936 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 937 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 938 PetscFunctionReturn(PETSC_SUCCESS); 939 } 940 941 /* handles sparse matrix matrix ops */ 942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 943 { 944 Mat_Product *product = mat->product; 945 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 946 947 PetscFunctionBegin; 948 MatCheckProduct(mat, 1); 949 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 950 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 951 if (Biskok && Ciskok) { 952 switch (product->type) { 953 case MATPRODUCT_AB: 954 case MATPRODUCT_AtB: 955 case MATPRODUCT_ABt: 956 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 957 break; 958 case MATPRODUCT_PtAP: 959 case MATPRODUCT_RARt: 960 case MATPRODUCT_ABC: 961 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 962 break; 963 default: 964 break; 965 } 966 } else { /* fallback for AIJ */ 967 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 968 } 969 PetscFunctionReturn(PETSC_SUCCESS); 970 } 971 972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 973 { 974 Mat_SeqAIJKokkos *aijkok; 975 976 PetscFunctionBegin; 977 PetscCall(PetscLogGpuTimeBegin()); 978 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 979 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 980 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 981 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 982 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 983 PetscCall(PetscLogGpuTimeEnd()); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 988 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 989 { 990 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 991 992 PetscFunctionBegin; 993 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 994 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 995 996 PetscCall(PetscLogGpuTimeBegin()); 997 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 998 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 999 const auto &Aa = aijkok->a_dual.view_device(); 1000 const auto &Adiag = aijkok->diag_dual.view_device(); 1001 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1002 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1003 PetscCall(PetscLogGpuFlops(n)); 1004 PetscCall(PetscLogGpuTimeEnd()); 1005 } else { // need reassembly, very slow! 1006 PetscCall(MatShift_Basic(A, a)); 1007 } 1008 PetscFunctionReturn(PETSC_SUCCESS); 1009 } 1010 1011 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1012 { 1013 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1014 1015 PetscFunctionBegin; 1016 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1017 ConstPetscScalarKokkosView dv; 1018 PetscInt n, nv; 1019 1020 PetscCall(PetscLogGpuTimeBegin()); 1021 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1022 PetscCall(VecGetKokkosView(D, &dv)); 1023 PetscCall(VecGetLocalSize(D, &nv)); 1024 n = PetscMin(Y->rmap->n, Y->cmap->n); 1025 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1026 1027 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1028 const auto &Aa = aijkok->a_dual.view_device(); 1029 const auto &Adiag = aijkok->diag_dual.view_device(); 1030 PetscCallCXX(Kokkos::parallel_for( 1031 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1032 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1033 else Aa(Adiag(i)) += dv(i); 1034 })); 1035 PetscCall(VecRestoreKokkosView(D, &dv)); 1036 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1037 PetscCall(PetscLogGpuFlops(n)); 1038 PetscCall(PetscLogGpuTimeEnd()); 1039 } else { // need reassembly, very slow! 1040 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1041 } 1042 PetscFunctionReturn(PETSC_SUCCESS); 1043 } 1044 1045 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1046 { 1047 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1048 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1049 ConstPetscScalarKokkosView lv, rv; 1050 1051 PetscFunctionBegin; 1052 PetscCall(PetscLogGpuTimeBegin()); 1053 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1054 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1055 const auto &Aa = aijkok->a_dual.view_device(); 1056 const auto &Ai = aijkok->i_dual.view_device(); 1057 const auto &Aj = aijkok->j_dual.view_device(); 1058 if (ll) { 1059 PetscCall(VecGetLocalSize(ll, &m)); 1060 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1061 PetscCall(VecGetKokkosView(ll, &lv)); 1062 PetscCallCXX(Kokkos::parallel_for( // for each row 1063 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1064 PetscInt i = t.league_rank(); // row i 1065 PetscInt len = Ai(i + 1) - Ai(i); 1066 // scale entries on the row 1067 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1068 })); 1069 PetscCall(VecRestoreKokkosView(ll, &lv)); 1070 PetscCall(PetscLogGpuFlops(nz)); 1071 } 1072 if (rr) { 1073 PetscCall(VecGetLocalSize(rr, &n)); 1074 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1075 PetscCall(VecGetKokkosView(rr, &rv)); 1076 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1077 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1078 PetscCall(VecRestoreKokkosView(rr, &lv)); 1079 PetscCall(PetscLogGpuFlops(nz)); 1080 } 1081 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1082 PetscCall(PetscLogGpuTimeEnd()); 1083 PetscFunctionReturn(PETSC_SUCCESS); 1084 } 1085 1086 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1087 { 1088 Mat_SeqAIJKokkos *aijkok; 1089 1090 PetscFunctionBegin; 1091 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 if (aijkok) { /* Only zero the device if data is already there */ 1093 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1094 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1095 } else { /* Might be preallocated but not assembled */ 1096 PetscCall(MatZeroEntries_SeqAIJ(A)); 1097 } 1098 PetscFunctionReturn(PETSC_SUCCESS); 1099 } 1100 1101 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1102 { 1103 Mat_SeqAIJKokkos *aijkok; 1104 PetscInt n; 1105 PetscScalarKokkosView xv; 1106 1107 PetscFunctionBegin; 1108 PetscCall(VecGetLocalSize(x, &n)); 1109 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1110 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1111 1112 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1113 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1114 1115 const auto &Aa = aijkok->a_dual.view_device(); 1116 const auto &Ai = aijkok->i_dual.view_device(); 1117 const auto &Adiag = aijkok->diag_dual.view_device(); 1118 1119 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1120 Kokkos::parallel_for( 1121 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1122 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1123 else xv(i) = 0; 1124 }); 1125 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1126 PetscFunctionReturn(PETSC_SUCCESS); 1127 } 1128 1129 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1130 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1131 { 1132 Mat_SeqAIJKokkos *aijkok; 1133 1134 PetscFunctionBegin; 1135 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1136 PetscAssertPointer(kv, 2); 1137 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1138 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1139 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1140 *kv = aijkok->a_dual.view_device(); 1141 PetscFunctionReturn(PETSC_SUCCESS); 1142 } 1143 1144 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1145 { 1146 PetscFunctionBegin; 1147 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1148 PetscAssertPointer(kv, 2); 1149 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1150 PetscFunctionReturn(PETSC_SUCCESS); 1151 } 1152 1153 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1154 { 1155 Mat_SeqAIJKokkos *aijkok; 1156 1157 PetscFunctionBegin; 1158 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1159 PetscAssertPointer(kv, 2); 1160 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1161 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1162 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1163 *kv = aijkok->a_dual.view_device(); 1164 PetscFunctionReturn(PETSC_SUCCESS); 1165 } 1166 1167 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1168 { 1169 PetscFunctionBegin; 1170 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1171 PetscAssertPointer(kv, 2); 1172 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1173 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1174 PetscFunctionReturn(PETSC_SUCCESS); 1175 } 1176 1177 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1178 { 1179 Mat_SeqAIJKokkos *aijkok; 1180 1181 PetscFunctionBegin; 1182 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1183 PetscAssertPointer(kv, 2); 1184 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1185 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1186 *kv = aijkok->a_dual.view_device(); 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1191 { 1192 PetscFunctionBegin; 1193 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1194 PetscAssertPointer(kv, 2); 1195 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1196 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1197 PetscFunctionReturn(PETSC_SUCCESS); 1198 } 1199 1200 /* Computes Y += alpha X */ 1201 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1202 { 1203 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1204 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1205 ConstMatScalarKokkosView Xa; 1206 MatScalarKokkosView Ya; 1207 auto &exec = PetscGetKokkosExecutionSpace(); 1208 1209 PetscFunctionBegin; 1210 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1211 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1212 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1213 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1214 PetscCall(PetscLogGpuTimeBegin()); 1215 1216 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1217 PetscBool e; 1218 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1219 if (e) { 1220 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1221 if (e) pattern = SAME_NONZERO_PATTERN; 1222 } 1223 } 1224 1225 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1226 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1227 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1228 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1229 */ 1230 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1231 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1232 Xa = xkok->a_dual.view_device(); 1233 Ya = ykok->a_dual.view_device(); 1234 1235 if (pattern == SAME_NONZERO_PATTERN) { 1236 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1237 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1238 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1239 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1240 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1241 1242 Kokkos::parallel_for( 1243 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1244 PetscInt i = t.league_rank(); // row i 1245 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1246 // Only one thread works in a team 1247 PetscInt p, q = Yi(i); 1248 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1249 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1250 if (Xj(p) == Yj(q)) { // Found it 1251 Ya(q) += alpha * Xa(p); 1252 q++; 1253 } else { 1254 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1255 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1256 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1257 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1258 #else 1259 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1260 #endif 1261 } 1262 } 1263 }); 1264 }); 1265 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1266 } else { // different nonzero patterns 1267 Mat Z; 1268 KokkosCsrMatrix zcsr; 1269 KernelHandle kh; 1270 kh.create_spadd_handle(true); // X, Y are sorted 1271 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1272 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1273 zkok = new Mat_SeqAIJKokkos(zcsr); 1274 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1275 PetscCall(MatHeaderReplace(Y, &Z)); 1276 kh.destroy_spadd_handle(); 1277 } 1278 PetscCall(PetscLogGpuTimeEnd()); 1279 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1280 PetscFunctionReturn(PETSC_SUCCESS); 1281 } 1282 1283 struct MatCOOStruct_SeqAIJKokkos { 1284 PetscCount n; 1285 PetscCount Atot; 1286 PetscInt nz; 1287 PetscCountKokkosView jmap; 1288 PetscCountKokkosView perm; 1289 1290 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1291 { 1292 nz = coo_h->nz; 1293 n = coo_h->n; 1294 Atot = coo_h->Atot; 1295 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1296 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1297 } 1298 }; 1299 1300 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1301 { 1302 PetscFunctionBegin; 1303 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1304 PetscFunctionReturn(PETSC_SUCCESS); 1305 } 1306 1307 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1308 { 1309 Mat_SeqAIJKokkos *akok; 1310 Mat_SeqAIJ *aseq; 1311 PetscContainer container_h, container_d; 1312 MatCOOStruct_SeqAIJ *coo_h; 1313 MatCOOStruct_SeqAIJKokkos *coo_d; 1314 1315 PetscFunctionBegin; 1316 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1317 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1318 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1319 delete akok; 1320 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1321 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1322 1323 // Copy the COO struct to device 1324 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1325 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1326 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1327 1328 // Put the COO struct in a container and then attach that to the matrix 1329 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1330 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1331 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1332 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1333 PetscCall(PetscContainerDestroy(&container_d)); 1334 PetscFunctionReturn(PETSC_SUCCESS); 1335 } 1336 1337 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1338 { 1339 MatScalarKokkosView Aa; 1340 ConstMatScalarKokkosView kv; 1341 PetscMemType memtype; 1342 PetscContainer container; 1343 MatCOOStruct_SeqAIJKokkos *coo; 1344 1345 PetscFunctionBegin; 1346 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1347 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1348 1349 const auto &n = coo->n; 1350 const auto &Annz = coo->nz; 1351 const auto &jmap = coo->jmap; 1352 const auto &perm = coo->perm; 1353 1354 PetscCall(PetscGetMemType(v, &memtype)); 1355 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1356 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1357 } else { 1358 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1359 } 1360 1361 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1362 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1363 1364 PetscCall(PetscLogGpuTimeBegin()); 1365 Kokkos::parallel_for( 1366 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1367 PetscScalar sum = 0.0; 1368 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1369 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1370 }); 1371 PetscCall(PetscLogGpuTimeEnd()); 1372 1373 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1374 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1375 PetscFunctionReturn(PETSC_SUCCESS); 1376 } 1377 1378 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1379 { 1380 PetscFunctionBegin; 1381 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1382 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1383 B->offloadmask = PETSC_OFFLOAD_CPU; 1384 PetscFunctionReturn(PETSC_SUCCESS); 1385 } 1386 1387 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1388 { 1389 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1390 1391 PetscFunctionBegin; 1392 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1393 A->boundtocpu = PETSC_FALSE; 1394 1395 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1396 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1397 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1398 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1399 A->ops->scale = MatScale_SeqAIJKokkos; 1400 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1401 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1402 A->ops->mult = MatMult_SeqAIJKokkos; 1403 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1404 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1405 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1406 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1407 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1408 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1409 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1410 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1411 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1412 A->ops->shift = MatShift_SeqAIJKokkos; 1413 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1414 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1415 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1416 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1417 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1418 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1419 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1420 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1421 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1422 1423 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1424 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1425 PetscFunctionReturn(PETSC_SUCCESS); 1426 } 1427 1428 /* 1429 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1430 1431 Input Parameters: 1432 + A - the MATSEQAIJKOKKOS matrix 1433 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1434 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1435 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1436 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1437 1438 Output Parameter: 1439 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1440 */ 1441 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1442 { 1443 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1444 PetscInt nblocks = bs.extent(0) - 1; 1445 1446 PetscFunctionBegin; 1447 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1448 1449 // Pull out the diagonal blocks of the matrix and then invert the blocks 1450 auto Aa = akok->a_dual.view_device(); 1451 auto Ai = akok->i_dual.view_device(); 1452 auto Aj = akok->j_dual.view_device(); 1453 auto Adiag = akok->diag_dual.view_device(); 1454 // TODO: how to tune the team size? 1455 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1456 auto ts = Kokkos::AUTO(); 1457 #else 1458 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1459 #endif 1460 PetscCallCXX(Kokkos::parallel_for( 1461 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1462 const PetscInt bid = teamMember.league_rank(); // block id 1463 const PetscInt rstart = bs(bid); // this block starts from this row 1464 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1465 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1466 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1467 1468 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1469 PetscInt i = rstart + r; // i-th row in A 1470 1471 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1472 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1473 1474 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1475 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1476 B(r, c) = 0.0; 1477 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1478 B(r, c) = Aa(first + c); 1479 } else { // this entry does not show up in the CSR 1480 B(r, c) = 0.0; 1481 } 1482 } 1483 } else { // rare case that the diagonal does not exist 1484 const PetscInt begin = Ai(i); 1485 const PetscInt end = Ai(i + 1); 1486 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1487 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1488 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1489 else if (Aj(j) >= rstart + m) break; 1490 } 1491 } 1492 }); 1493 1494 // LU-decompose B (w/o pivoting) and then invert B 1495 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1496 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1497 })); 1498 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1499 PetscFunctionReturn(PETSC_SUCCESS); 1500 } 1501 1502 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1503 { 1504 Mat_SeqAIJ *aseq; 1505 PetscInt i, m, n; 1506 auto &exec = PetscGetKokkosExecutionSpace(); 1507 1508 PetscFunctionBegin; 1509 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1510 1511 m = akok->nrows(); 1512 n = akok->ncols(); 1513 PetscCall(MatSetSizes(A, m, n, m, n)); 1514 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1515 1516 /* Set up data structures of A as a MATSEQAIJ */ 1517 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1518 aseq = (Mat_SeqAIJ *)(A)->data; 1519 1520 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1521 PetscCallCXX(akok->j_dual.sync_host(exec)); 1522 PetscCallCXX(exec.fence()); 1523 1524 aseq->i = akok->i_host_data(); 1525 aseq->j = akok->j_host_data(); 1526 aseq->a = akok->a_host_data(); 1527 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1528 aseq->free_a = PETSC_FALSE; 1529 aseq->free_ij = PETSC_FALSE; 1530 aseq->nz = akok->nnz(); 1531 aseq->maxnz = aseq->nz; 1532 1533 PetscCall(PetscMalloc1(m, &aseq->imax)); 1534 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1535 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1536 1537 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1538 akok->nonzerostate = A->nonzerostate; 1539 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1540 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1541 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1542 PetscFunctionReturn(PETSC_SUCCESS); 1543 } 1544 1545 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1546 { 1547 PetscFunctionBegin; 1548 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1549 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1550 PetscFunctionReturn(PETSC_SUCCESS); 1551 } 1552 1553 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1554 { 1555 Mat_SeqAIJKokkos *akok; 1556 1557 PetscFunctionBegin; 1558 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1559 PetscCall(MatCreate(comm, A)); 1560 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1561 PetscFunctionReturn(PETSC_SUCCESS); 1562 } 1563 1564 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1565 1566 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1567 */ 1568 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1569 { 1570 PetscFunctionBegin; 1571 PetscCall(MatCreate(comm, A)); 1572 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1573 PetscFunctionReturn(PETSC_SUCCESS); 1574 } 1575 1576 /*@C 1577 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1578 (the default parallel PETSc format). This matrix will ultimately be handled by 1579 Kokkos for calculations. 1580 1581 Collective 1582 1583 Input Parameters: 1584 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1585 . m - number of rows 1586 . n - number of columns 1587 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1588 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1589 1590 Output Parameter: 1591 . A - the matrix 1592 1593 Level: intermediate 1594 1595 Notes: 1596 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1597 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1598 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1599 1600 The AIJ format, also called 1601 compressed row storage, is fully compatible with standard Fortran 1602 storage. That is, the stored row and column indices can begin at 1603 either one (as in Fortran) or zero. 1604 1605 Specify the preallocated storage with either `nz` or `nnz` (not both). 1606 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1607 allocation. 1608 1609 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1610 @*/ 1611 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1612 { 1613 PetscFunctionBegin; 1614 PetscCall(PetscKokkosInitializeCheck()); 1615 PetscCall(MatCreate(comm, A)); 1616 PetscCall(MatSetSizes(*A, m, n, m, n)); 1617 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1618 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1619 PetscFunctionReturn(PETSC_SUCCESS); 1620 } 1621 1622 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1623 { 1624 PetscFunctionBegin; 1625 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1626 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1627 PetscFunctionReturn(PETSC_SUCCESS); 1628 } 1629 1630 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1631 { 1632 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1633 1634 PetscFunctionBegin; 1635 if (!factors->sptrsv_symbolic_completed) { 1636 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1637 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1638 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1639 } 1640 PetscFunctionReturn(PETSC_SUCCESS); 1641 } 1642 1643 /* Check if we need to update factors etc for transpose solve */ 1644 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1645 { 1646 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1647 MatColIdxType n = A->rmap->n; 1648 1649 PetscFunctionBegin; 1650 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1651 /* Update L^T and do sptrsv symbolic */ 1652 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1653 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1654 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1655 1656 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1657 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1658 1659 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1660 We have to sort the indices, until KK provides finer control options. 1661 */ 1662 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1663 1664 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1665 1666 /* Update U^T and do sptrsv symbolic */ 1667 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1668 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1669 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1670 1671 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1672 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1673 1674 /* Sort indices. See comments above */ 1675 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1676 1677 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1678 factors->transpose_updated = PETSC_TRUE; 1679 } 1680 PetscFunctionReturn(PETSC_SUCCESS); 1681 } 1682 1683 /* Solve Ax = b, with A = LU */ 1684 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1685 { 1686 ConstPetscScalarKokkosView bv; 1687 PetscScalarKokkosView xv; 1688 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1689 1690 PetscFunctionBegin; 1691 PetscCall(PetscLogGpuTimeBegin()); 1692 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1693 PetscCall(VecGetKokkosView(b, &bv)); 1694 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1695 /* Solve L tmpv = b */ 1696 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1697 /* Solve Ux = tmpv */ 1698 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1699 PetscCall(VecRestoreKokkosView(b, &bv)); 1700 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1701 PetscCall(PetscLogGpuTimeEnd()); 1702 PetscFunctionReturn(PETSC_SUCCESS); 1703 } 1704 1705 /* Solve A^T x = b, where A^T = U^T L^T */ 1706 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1707 { 1708 ConstPetscScalarKokkosView bv; 1709 PetscScalarKokkosView xv; 1710 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1711 1712 PetscFunctionBegin; 1713 PetscCall(PetscLogGpuTimeBegin()); 1714 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1715 PetscCall(VecGetKokkosView(b, &bv)); 1716 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1717 /* Solve U^T tmpv = b */ 1718 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1719 1720 /* Solve L^T x = tmpv */ 1721 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1722 PetscCall(VecRestoreKokkosView(b, &bv)); 1723 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1724 PetscCall(PetscLogGpuTimeEnd()); 1725 PetscFunctionReturn(PETSC_SUCCESS); 1726 } 1727 1728 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1729 { 1730 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1731 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1732 PetscInt fill_lev = info->levels; 1733 1734 PetscFunctionBegin; 1735 PetscCall(PetscLogGpuTimeBegin()); 1736 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1737 1738 auto a_d = aijkok->a_dual.view_device(); 1739 auto i_d = aijkok->i_dual.view_device(); 1740 auto j_d = aijkok->j_dual.view_device(); 1741 1742 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1743 1744 B->assembled = PETSC_TRUE; 1745 B->preallocated = PETSC_TRUE; 1746 B->ops->solve = MatSolve_SeqAIJKokkos; 1747 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1748 B->ops->matsolve = NULL; 1749 B->ops->matsolvetranspose = NULL; 1750 B->offloadmask = PETSC_OFFLOAD_GPU; 1751 1752 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1753 factors->transpose_updated = PETSC_FALSE; 1754 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1755 /* TODO: log flops, but how to know that? */ 1756 PetscCall(PetscLogGpuTimeEnd()); 1757 PetscFunctionReturn(PETSC_SUCCESS); 1758 } 1759 1760 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1761 { 1762 Mat_SeqAIJKokkos *aijkok; 1763 Mat_SeqAIJ *b; 1764 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1765 PetscInt fill_lev = info->levels; 1766 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1767 PetscInt n = A->rmap->n; 1768 1769 PetscFunctionBegin; 1770 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1771 /* Rebuild factors */ 1772 if (factors) { 1773 factors->Destroy(); 1774 } /* Destroy the old if it exists */ 1775 else { 1776 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1777 } 1778 1779 /* Create a spiluk handle and then do symbolic factorization */ 1780 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1781 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1782 1783 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1784 1785 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1786 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1787 Kokkos::realloc(factors->iU_d, n + 1); 1788 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1789 1790 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1791 auto i_d = aijkok->i_dual.view_device(); 1792 auto j_d = aijkok->j_dual.view_device(); 1793 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1794 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1795 1796 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1797 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1798 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1799 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1800 1801 /* TODO: add options to select sptrsv algorithms */ 1802 /* Create sptrsv handles for L, U and their transpose */ 1803 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1804 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1805 #else 1806 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1807 #endif 1808 1809 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1810 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1811 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1812 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1813 1814 /* Fill fields of the factor matrix B */ 1815 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1816 b = (Mat_SeqAIJ *)B->data; 1817 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1818 B->info.fill_ratio_given = info->fill; 1819 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1820 1821 B->offloadmask = PETSC_OFFLOAD_GPU; 1822 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1823 PetscFunctionReturn(PETSC_SUCCESS); 1824 } 1825 1826 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1827 { 1828 PetscFunctionBegin; 1829 *type = MATSOLVERKOKKOS; 1830 PetscFunctionReturn(PETSC_SUCCESS); 1831 } 1832 1833 /*MC 1834 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1835 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1836 1837 Level: beginner 1838 1839 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1840 M*/ 1841 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1842 { 1843 PetscInt n = A->rmap->n; 1844 1845 PetscFunctionBegin; 1846 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1847 PetscCall(MatSetSizes(*B, n, n, n, n)); 1848 (*B)->factortype = ftype; 1849 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1850 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1851 1852 if (ftype == MAT_FACTOR_LU) { 1853 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1854 (*B)->canuseordering = PETSC_TRUE; 1855 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1856 } else if (ftype == MAT_FACTOR_ILU) { 1857 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1858 (*B)->canuseordering = PETSC_FALSE; 1859 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1860 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1861 1862 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1863 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1864 PetscFunctionReturn(PETSC_SUCCESS); 1865 } 1866 1867 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1868 { 1869 PetscFunctionBegin; 1870 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1871 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1872 PetscFunctionReturn(PETSC_SUCCESS); 1873 } 1874 1875 /* Utility to print out a KokkosCsrMatrix for debugging */ 1876 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1877 { 1878 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1879 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1880 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1881 const PetscInt *i = iv.data(); 1882 const PetscInt *j = jv.data(); 1883 const PetscScalar *a = av.data(); 1884 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1885 1886 PetscFunctionBegin; 1887 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1888 for (PetscInt k = 0; k < m; k++) { 1889 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1890 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1891 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1892 } 1893 PetscFunctionReturn(PETSC_SUCCESS); 1894 } 1895