1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289 } 290 } else { // Generate T of size n x m for the first time 291 MatRowMapKokkosView perm; 292 293 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 294 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 296 } 297 akok->transpose_updated = PETSC_TRUE; 298 *csrmatT = akok->csrmatT; 299 } 300 PetscFunctionReturn(PETSC_SUCCESS); 301 } 302 303 // Generate the Hermitian on device and cache it internally 304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 305 { 306 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 307 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 308 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 309 KokkosCsrMatrix &T = akok->csrmatH; 310 311 PetscFunctionBegin; 312 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 313 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 314 315 const auto &Aa = akok->a_dual.view_device(); 316 317 if (A->hermitian == PETSC_BOOL3_TRUE) { 318 *csrmatH = akok->csrmat; 319 } else { 320 // See if we already have a cached hermitian and its value is up to date 321 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 322 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 323 const auto &perm = akok->transpose_perm; // get the permutation array 324 auto &Ta = T.values; 325 326 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 334 } 335 akok->hermitian_updated = PETSC_TRUE; 336 *csrmatH = akok->csrmatH; 337 } 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 /* y = A x */ 342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv; 346 PetscScalarKokkosView yv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx, &xv)); 352 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 353 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 355 PetscCall(VecRestoreKokkosView(xx, &xv)); 356 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 358 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 359 PetscCall(PetscLogGpuTimeEnd()); 360 PetscFunctionReturn(PETSC_SUCCESS); 361 } 362 363 /* y = A^T x */ 364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365 { 366 Mat_SeqAIJKokkos *aijkok; 367 const char *mode; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 KokkosCsrMatrix csrmat; 371 372 PetscFunctionBegin; 373 PetscCall(PetscLogGpuTimeBegin()); 374 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 375 PetscCall(VecGetKokkosView(xx, &xv)); 376 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377 if (A->form_explicit_transpose) { 378 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379 mode = "N"; 380 } else { 381 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 382 csrmat = aijkok->csrmat; 383 mode = "T"; 384 } 385 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 386 PetscCall(VecRestoreKokkosView(xx, &xv)); 387 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 388 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 /* y = A^H x */ 394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv; 399 PetscScalarKokkosView yv; 400 KokkosCsrMatrix csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx, &xv)); 406 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407 if (A->form_explicit_transpose) { 408 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409 mode = "N"; 410 } else { 411 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 412 csrmat = aijkok->csrmat; 413 mode = "C"; 414 } 415 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 416 PetscCall(VecRestoreKokkosView(xx, &xv)); 417 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 418 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 419 PetscCall(PetscLogGpuTimeEnd()); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 /* z = A x + y */ 424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425 { 426 Mat_SeqAIJKokkos *aijkok; 427 ConstPetscScalarKokkosView xv; 428 PetscScalarKokkosView zv; 429 430 PetscFunctionBegin; 431 PetscCall(PetscLogGpuTimeBegin()); 432 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 433 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 434 PetscCall(VecGetKokkosView(xx, &xv)); 435 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 436 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 437 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 438 PetscCall(VecRestoreKokkosView(xx, &xv)); 439 PetscCall(VecRestoreKokkosView(zz, &zv)); 440 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 441 PetscCall(PetscLogGpuTimeEnd()); 442 PetscFunctionReturn(PETSC_SUCCESS); 443 } 444 445 /* z = A^T x + y */ 446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 447 { 448 Mat_SeqAIJKokkos *aijkok; 449 const char *mode; 450 ConstPetscScalarKokkosView xv; 451 PetscScalarKokkosView zv; 452 KokkosCsrMatrix csrmat; 453 454 PetscFunctionBegin; 455 PetscCall(PetscLogGpuTimeBegin()); 456 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 457 if (zz != yy) PetscCall(VecCopy(yy, zz)); 458 PetscCall(VecGetKokkosView(xx, &xv)); 459 PetscCall(VecGetKokkosView(zz, &zv)); 460 if (A->form_explicit_transpose) { 461 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 462 mode = "N"; 463 } else { 464 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 465 csrmat = aijkok->csrmat; 466 mode = "T"; 467 } 468 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 469 PetscCall(VecRestoreKokkosView(xx, &xv)); 470 PetscCall(VecRestoreKokkosView(zz, &zv)); 471 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 472 PetscCall(PetscLogGpuTimeEnd()); 473 PetscFunctionReturn(PETSC_SUCCESS); 474 } 475 476 /* z = A^H x + y */ 477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 478 { 479 Mat_SeqAIJKokkos *aijkok; 480 const char *mode; 481 ConstPetscScalarKokkosView xv; 482 PetscScalarKokkosView zv; 483 KokkosCsrMatrix csrmat; 484 485 PetscFunctionBegin; 486 PetscCall(PetscLogGpuTimeBegin()); 487 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 488 if (zz != yy) PetscCall(VecCopy(yy, zz)); 489 PetscCall(VecGetKokkosView(xx, &xv)); 490 PetscCall(VecGetKokkosView(zz, &zv)); 491 if (A->form_explicit_transpose) { 492 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 493 mode = "N"; 494 } else { 495 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 496 csrmat = aijkok->csrmat; 497 mode = "C"; 498 } 499 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 500 PetscCall(VecRestoreKokkosView(xx, &xv)); 501 PetscCall(VecRestoreKokkosView(zz, &zv)); 502 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 503 PetscCall(PetscLogGpuTimeEnd()); 504 PetscFunctionReturn(PETSC_SUCCESS); 505 } 506 507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508 { 509 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510 511 PetscFunctionBegin; 512 switch (op) { 513 case MAT_FORM_EXPLICIT_TRANSPOSE: 514 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 515 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516 A->form_explicit_transpose = flg; 517 break; 518 default: 519 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520 break; 521 } 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 /* Depending on reuse, either build a new mat, or use the existing mat */ 526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527 { 528 Mat_SeqAIJ *aseq; 529 530 PetscFunctionBegin; 531 PetscCall(PetscKokkosInitializeCheck()); 532 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 533 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 534 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 535 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 537 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 538 PetscCall(PetscFree(A->defaultvectype)); 539 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 540 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 541 PetscCall(MatSetOps_SeqAIJKokkos(A)); 542 aseq = static_cast<Mat_SeqAIJ *>(A->data); 543 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 544 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 546 } 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553 */ 554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555 { 556 Mat_SeqAIJ *bseq; 557 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558 Mat mat; 559 560 PetscFunctionBegin; 561 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 562 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563 mat = *B; 564 if (A->assembled) { 565 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 567 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568 /* Now copy values to B if needed */ 569 if (dupOption == MAT_COPY_VALUES) { 570 if (akok->a_dual.need_sync_device()) { 571 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572 bkok->a_dual.modify_host(); 573 } else { /* If device has the latest data, we only copy data on device */ 574 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575 bkok->a_dual.modify_device(); 576 } 577 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579 bkok->a_dual.modify_host(); 580 } 581 mat->spptr = bkok; 582 } 583 584 PetscCall(PetscFree(mat->defaultvectype)); 585 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 586 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 587 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592 { 593 Mat At; 594 KokkosCsrMatrix internT; 595 Mat_SeqAIJKokkos *atkok, *bkok; 596 597 PetscFunctionBegin; 598 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 599 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 600 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601 /* Deep copy internT, as we want to isolate the internal transpose */ 602 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 603 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 604 if (reuse == MAT_INITIAL_MATRIX) *B = At; 605 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 606 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 607 if ((*B)->assembled) { 608 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 609 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 610 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 611 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 612 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 613 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 614 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 615 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 616 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 617 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 618 } 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623 { 624 Mat_SeqAIJKokkos *aijkok; 625 626 PetscFunctionBegin; 627 if (A->factortype == MAT_FACTOR_NONE) { 628 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 629 delete aijkok; 630 } else { 631 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 632 } 633 A->spptr = NULL; 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 635 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 636 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 637 PetscCall(MatDestroy_SeqAIJ(A)); 638 PetscFunctionReturn(PETSC_SUCCESS); 639 } 640 641 /*MC 642 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 643 644 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 645 646 Options Database Key: 647 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 648 649 Level: beginner 650 651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 652 M*/ 653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 654 { 655 PetscFunctionBegin; 656 PetscCall(PetscKokkosInitializeCheck()); 657 PetscCall(MatCreate_SeqAIJ(A)); 658 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 659 PetscFunctionReturn(PETSC_SUCCESS); 660 } 661 662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 664 { 665 Mat_SeqAIJ *a, *b; 666 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667 MatScalarKokkosView aa, ba, ca; 668 MatRowMapKokkosView ai, bi, ci; 669 MatColIdxKokkosView aj, bj, cj; 670 PetscInt m, n, nnz, aN; 671 672 PetscFunctionBegin; 673 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 674 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 675 PetscAssertPointer(C, 4); 676 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 677 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 678 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 679 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 680 681 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 682 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 683 a = static_cast<Mat_SeqAIJ *>(A->data); 684 b = static_cast<Mat_SeqAIJ *>(B->data); 685 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 686 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 687 aa = akok->a_dual.view_device(); 688 ai = akok->i_dual.view_device(); 689 ba = bkok->a_dual.view_device(); 690 bi = bkok->i_dual.view_device(); 691 m = A->rmap->n; /* M, N and nnz of C */ 692 n = A->cmap->n + B->cmap->n; 693 nnz = a->nz + b->nz; 694 aN = A->cmap->n; /* N of A */ 695 if (reuse == MAT_INITIAL_MATRIX) { 696 aj = akok->j_dual.view_device(); 697 bj = bkok->j_dual.view_device(); 698 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 699 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 700 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 701 ca = ca_dual.view_device(); 702 ci = ci_dual.view_device(); 703 cj = cj_dual.view_device(); 704 705 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 706 Kokkos::parallel_for( 707 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 708 PetscInt i = t.league_rank(); /* row i */ 709 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 710 711 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 712 ci(i) = coffset; 713 if (i == m - 1) ci(m) = ai(m) + bi(m); 714 }); 715 716 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 717 if (k < alen) { 718 ca(coffset + k) = aa(ai(i) + k); 719 cj(coffset + k) = aj(ai(i) + k); 720 } else { 721 ca(coffset + k) = ba(bi(i) + k - alen); 722 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 723 } 724 }); 725 }); 726 ca_dual.modify_device(); 727 ci_dual.modify_device(); 728 cj_dual.modify_device(); 729 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 730 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 731 } else if (reuse == MAT_REUSE_MATRIX) { 732 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 733 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 734 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 735 ca = ckok->a_dual.view_device(); 736 ci = ckok->i_dual.view_device(); 737 738 Kokkos::parallel_for( 739 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 740 PetscInt i = t.league_rank(); /* row i */ 741 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 742 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 743 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 744 else ca(ci(i) + k) = ba(bi(i) + k - alen); 745 }); 746 }); 747 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 748 } 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 753 { 754 PetscFunctionBegin; 755 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 756 PetscFunctionReturn(PETSC_SUCCESS); 757 } 758 759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 760 { 761 Mat_Product *product = C->product; 762 Mat A, B; 763 bool transA, transB; /* use bool, since KK needs this type */ 764 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 765 Mat_SeqAIJ *c; 766 MatProductData_SeqAIJKokkos *pdata; 767 KokkosCsrMatrix csrmatA, csrmatB; 768 769 PetscFunctionBegin; 770 MatCheckProduct(C, 1); 771 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 772 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 773 774 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 775 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 776 // we still do numeric. 777 if (pdata->reusesym) { // numeric reuses results from symbolic 778 pdata->reusesym = PETSC_FALSE; 779 PetscFunctionReturn(PETSC_SUCCESS); 780 } 781 782 switch (product->type) { 783 case MATPRODUCT_AB: 784 transA = false; 785 transB = false; 786 break; 787 case MATPRODUCT_AtB: 788 transA = true; 789 transB = false; 790 break; 791 case MATPRODUCT_ABt: 792 transA = false; 793 transB = true; 794 break; 795 default: 796 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 797 } 798 799 A = product->A; 800 B = product->B; 801 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 802 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 803 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 804 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 805 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 806 807 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 808 809 csrmatA = akok->csrmat; 810 csrmatB = bkok->csrmat; 811 812 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 813 if (transA) { 814 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 815 transA = false; 816 } 817 818 if (transB) { 819 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 820 transB = false; 821 } 822 PetscCall(PetscLogGpuTimeBegin()); 823 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 825 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 826 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 827 #endif 828 829 PetscCall(PetscLogGpuTimeEnd()); 830 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 831 /* shorter version of MatAssemblyEnd_SeqAIJ */ 832 c = (Mat_SeqAIJ *)C->data; 833 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 834 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 835 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 836 c->reallocs = 0; 837 C->info.mallocs = 0; 838 C->info.nz_unneeded = 0; 839 C->assembled = C->was_assembled = PETSC_TRUE; 840 C->num_ass++; 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 845 { 846 Mat_Product *product = C->product; 847 MatProductType ptype; 848 Mat A, B; 849 bool transA, transB; 850 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 851 MatProductData_SeqAIJKokkos *pdata; 852 MPI_Comm comm; 853 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 854 855 PetscFunctionBegin; 856 MatCheckProduct(C, 1); 857 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 858 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 859 A = product->A; 860 B = product->B; 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 863 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 864 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 865 csrmatA = akok->csrmat; 866 csrmatB = bkok->csrmat; 867 868 ptype = product->type; 869 // Take advantage of the symmetry if true 870 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 871 ptype = MATPRODUCT_AB; 872 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 873 } 874 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 875 ptype = MATPRODUCT_AB; 876 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 877 } 878 879 switch (ptype) { 880 case MATPRODUCT_AB: 881 transA = false; 882 transB = false; 883 break; 884 case MATPRODUCT_AtB: 885 transA = true; 886 transB = false; 887 break; 888 case MATPRODUCT_ABt: 889 transA = false; 890 transB = true; 891 break; 892 default: 893 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 894 } 895 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 896 pdata->reusesym = product->api_user; 897 898 /* TODO: add command line options to select spgemm algorithms */ 899 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 900 901 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 903 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 904 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 905 #endif 906 #endif 907 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 908 909 PetscCall(PetscLogGpuTimeBegin()); 910 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 911 if (transA) { 912 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 913 transA = false; 914 } 915 916 if (transB) { 917 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 918 transB = false; 919 } 920 921 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 922 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 923 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 924 calling new Mat_SeqAIJKokkos(). 925 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 926 */ 927 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 929 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 930 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 931 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 932 #endif 933 PetscCall(PetscLogGpuTimeEnd()); 934 935 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 936 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 937 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 938 PetscFunctionReturn(PETSC_SUCCESS); 939 } 940 941 /* handles sparse matrix matrix ops */ 942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 943 { 944 Mat_Product *product = mat->product; 945 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 946 947 PetscFunctionBegin; 948 MatCheckProduct(mat, 1); 949 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 950 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 951 if (Biskok && Ciskok) { 952 switch (product->type) { 953 case MATPRODUCT_AB: 954 case MATPRODUCT_AtB: 955 case MATPRODUCT_ABt: 956 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 957 break; 958 case MATPRODUCT_PtAP: 959 case MATPRODUCT_RARt: 960 case MATPRODUCT_ABC: 961 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 962 break; 963 default: 964 break; 965 } 966 } else { /* fallback for AIJ */ 967 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 968 } 969 PetscFunctionReturn(PETSC_SUCCESS); 970 } 971 972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 973 { 974 Mat_SeqAIJKokkos *aijkok; 975 976 PetscFunctionBegin; 977 PetscCall(PetscLogGpuTimeBegin()); 978 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 979 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 980 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 981 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 982 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 983 PetscCall(PetscLogGpuTimeEnd()); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 988 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 989 { 990 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 991 992 PetscFunctionBegin; 993 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 994 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 995 996 PetscCall(PetscLogGpuTimeBegin()); 997 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 998 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 999 const auto &Aa = aijkok->a_dual.view_device(); 1000 const auto &Adiag = aijkok->diag_dual.view_device(); 1001 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1002 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1003 PetscCall(PetscLogGpuFlops(n)); 1004 PetscCall(PetscLogGpuTimeEnd()); 1005 } else { // need reassembly, very slow! 1006 PetscCall(MatShift_Basic(A, a)); 1007 } 1008 PetscFunctionReturn(PETSC_SUCCESS); 1009 } 1010 1011 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1012 { 1013 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1014 1015 PetscFunctionBegin; 1016 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1017 ConstPetscScalarKokkosView dv; 1018 PetscInt n, nv; 1019 1020 PetscCall(PetscLogGpuTimeBegin()); 1021 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1022 PetscCall(VecGetKokkosView(D, &dv)); 1023 PetscCall(VecGetLocalSize(D, &nv)); 1024 n = PetscMin(Y->rmap->n, Y->cmap->n); 1025 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1026 1027 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1028 const auto &Aa = aijkok->a_dual.view_device(); 1029 const auto &Adiag = aijkok->diag_dual.view_device(); 1030 PetscCallCXX(Kokkos::parallel_for( 1031 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1032 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1033 else Aa(Adiag(i)) += dv(i); 1034 })); 1035 PetscCall(VecRestoreKokkosView(D, &dv)); 1036 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1037 PetscCall(PetscLogGpuFlops(n)); 1038 PetscCall(PetscLogGpuTimeEnd()); 1039 } else { // need reassembly, very slow! 1040 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1041 } 1042 PetscFunctionReturn(PETSC_SUCCESS); 1043 } 1044 1045 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1046 { 1047 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1048 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1049 ConstPetscScalarKokkosView lv, rv; 1050 1051 PetscFunctionBegin; 1052 PetscCall(PetscLogGpuTimeBegin()); 1053 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1054 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1055 const auto &Aa = aijkok->a_dual.view_device(); 1056 const auto &Ai = aijkok->i_dual.view_device(); 1057 const auto &Aj = aijkok->j_dual.view_device(); 1058 if (ll) { 1059 PetscCall(VecGetLocalSize(ll, &m)); 1060 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1061 PetscCall(VecGetKokkosView(ll, &lv)); 1062 PetscCallCXX(Kokkos::parallel_for( // for each row 1063 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1064 PetscInt i = t.league_rank(); // row i 1065 PetscInt len = Ai(i + 1) - Ai(i); 1066 // scale entries on the row 1067 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1068 })); 1069 PetscCall(VecRestoreKokkosView(ll, &lv)); 1070 PetscCall(PetscLogGpuFlops(nz)); 1071 } 1072 if (rr) { 1073 PetscCall(VecGetLocalSize(rr, &n)); 1074 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1075 PetscCall(VecGetKokkosView(rr, &rv)); 1076 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1077 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1078 PetscCall(VecRestoreKokkosView(rr, &lv)); 1079 PetscCall(PetscLogGpuFlops(nz)); 1080 } 1081 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1082 PetscCall(PetscLogGpuTimeEnd()); 1083 PetscFunctionReturn(PETSC_SUCCESS); 1084 } 1085 1086 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1087 { 1088 Mat_SeqAIJKokkos *aijkok; 1089 1090 PetscFunctionBegin; 1091 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 if (aijkok) { /* Only zero the device if data is already there */ 1093 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1094 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1095 } else { /* Might be preallocated but not assembled */ 1096 PetscCall(MatZeroEntries_SeqAIJ(A)); 1097 } 1098 PetscFunctionReturn(PETSC_SUCCESS); 1099 } 1100 1101 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1102 { 1103 Mat_SeqAIJKokkos *aijkok; 1104 PetscInt n; 1105 PetscScalarKokkosView xv; 1106 1107 PetscFunctionBegin; 1108 PetscCall(VecGetLocalSize(x, &n)); 1109 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1110 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1111 1112 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1113 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1114 1115 const auto &Aa = aijkok->a_dual.view_device(); 1116 const auto &Ai = aijkok->i_dual.view_device(); 1117 const auto &Adiag = aijkok->diag_dual.view_device(); 1118 1119 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1120 Kokkos::parallel_for( 1121 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1122 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1123 else xv(i) = 0; 1124 }); 1125 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1126 PetscFunctionReturn(PETSC_SUCCESS); 1127 } 1128 1129 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1130 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1131 { 1132 Mat_SeqAIJKokkos *aijkok; 1133 1134 PetscFunctionBegin; 1135 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1136 PetscAssertPointer(kv, 2); 1137 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1138 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1139 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1140 *kv = aijkok->a_dual.view_device(); 1141 PetscFunctionReturn(PETSC_SUCCESS); 1142 } 1143 1144 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1145 { 1146 PetscFunctionBegin; 1147 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1148 PetscAssertPointer(kv, 2); 1149 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1150 PetscFunctionReturn(PETSC_SUCCESS); 1151 } 1152 1153 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1154 { 1155 Mat_SeqAIJKokkos *aijkok; 1156 1157 PetscFunctionBegin; 1158 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1159 PetscAssertPointer(kv, 2); 1160 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1161 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1162 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1163 *kv = aijkok->a_dual.view_device(); 1164 PetscFunctionReturn(PETSC_SUCCESS); 1165 } 1166 1167 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1168 { 1169 PetscFunctionBegin; 1170 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1171 PetscAssertPointer(kv, 2); 1172 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1173 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1174 PetscFunctionReturn(PETSC_SUCCESS); 1175 } 1176 1177 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1178 { 1179 Mat_SeqAIJKokkos *aijkok; 1180 1181 PetscFunctionBegin; 1182 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1183 PetscAssertPointer(kv, 2); 1184 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1185 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1186 *kv = aijkok->a_dual.view_device(); 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1191 { 1192 PetscFunctionBegin; 1193 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1194 PetscAssertPointer(kv, 2); 1195 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1196 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1197 PetscFunctionReturn(PETSC_SUCCESS); 1198 } 1199 1200 /* Computes Y += alpha X */ 1201 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1202 { 1203 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1204 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1205 ConstMatScalarKokkosView Xa; 1206 MatScalarKokkosView Ya; 1207 auto &exec = PetscGetKokkosExecutionSpace(); 1208 1209 PetscFunctionBegin; 1210 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1211 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1212 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1213 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1214 PetscCall(PetscLogGpuTimeBegin()); 1215 1216 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1217 PetscBool e; 1218 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1219 if (e) { 1220 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1221 if (e) pattern = SAME_NONZERO_PATTERN; 1222 } 1223 } 1224 1225 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1226 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1227 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1228 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1229 */ 1230 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1231 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1232 Xa = xkok->a_dual.view_device(); 1233 Ya = ykok->a_dual.view_device(); 1234 1235 if (pattern == SAME_NONZERO_PATTERN) { 1236 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1237 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1238 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1239 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1240 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1241 1242 Kokkos::parallel_for( 1243 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1244 PetscInt i = t.league_rank(); // row i 1245 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1246 // Only one thread works in a team 1247 PetscInt p, q = Yi(i); 1248 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1249 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1250 if (Xj(p) == Yj(q)) { // Found it 1251 Ya(q) += alpha * Xa(p); 1252 q++; 1253 } else { 1254 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1255 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1256 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1257 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1258 #else 1259 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1260 #endif 1261 } 1262 } 1263 }); 1264 }); 1265 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1266 } else { // different nonzero patterns 1267 Mat Z; 1268 KokkosCsrMatrix zcsr; 1269 KernelHandle kh; 1270 kh.create_spadd_handle(true); // X, Y are sorted 1271 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1272 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1273 zkok = new Mat_SeqAIJKokkos(zcsr); 1274 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1275 PetscCall(MatHeaderReplace(Y, &Z)); 1276 kh.destroy_spadd_handle(); 1277 } 1278 PetscCall(PetscLogGpuTimeEnd()); 1279 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1280 PetscFunctionReturn(PETSC_SUCCESS); 1281 } 1282 1283 struct MatCOOStruct_SeqAIJKokkos { 1284 PetscCount n; 1285 PetscCount Atot; 1286 PetscInt nz; 1287 PetscCountKokkosView jmap; 1288 PetscCountKokkosView perm; 1289 1290 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1291 { 1292 nz = coo_h->nz; 1293 n = coo_h->n; 1294 Atot = coo_h->Atot; 1295 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1296 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1297 } 1298 }; 1299 1300 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1301 { 1302 PetscFunctionBegin; 1303 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1304 PetscFunctionReturn(PETSC_SUCCESS); 1305 } 1306 1307 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1308 { 1309 Mat_SeqAIJKokkos *akok; 1310 Mat_SeqAIJ *aseq; 1311 PetscContainer container_h; 1312 MatCOOStruct_SeqAIJ *coo_h; 1313 MatCOOStruct_SeqAIJKokkos *coo_d; 1314 1315 PetscFunctionBegin; 1316 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1317 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1318 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1319 delete akok; 1320 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1321 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1322 1323 // Copy the COO struct to device 1324 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1325 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1326 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1327 1328 // Put the COO struct in a container and then attach that to the matrix 1329 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1330 PetscFunctionReturn(PETSC_SUCCESS); 1331 } 1332 1333 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1334 { 1335 MatScalarKokkosView Aa; 1336 ConstMatScalarKokkosView kv; 1337 PetscMemType memtype; 1338 PetscContainer container; 1339 MatCOOStruct_SeqAIJKokkos *coo; 1340 1341 PetscFunctionBegin; 1342 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1343 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1344 1345 const auto &n = coo->n; 1346 const auto &Annz = coo->nz; 1347 const auto &jmap = coo->jmap; 1348 const auto &perm = coo->perm; 1349 1350 PetscCall(PetscGetMemType(v, &memtype)); 1351 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1352 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1353 } else { 1354 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1355 } 1356 1357 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1358 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1359 1360 PetscCall(PetscLogGpuTimeBegin()); 1361 Kokkos::parallel_for( 1362 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1363 PetscScalar sum = 0.0; 1364 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1365 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1366 }); 1367 PetscCall(PetscLogGpuTimeEnd()); 1368 1369 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1370 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1371 PetscFunctionReturn(PETSC_SUCCESS); 1372 } 1373 1374 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1375 { 1376 PetscFunctionBegin; 1377 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1378 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1379 B->offloadmask = PETSC_OFFLOAD_CPU; 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1384 { 1385 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1386 1387 PetscFunctionBegin; 1388 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1389 A->boundtocpu = PETSC_FALSE; 1390 1391 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1392 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1393 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1394 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1395 A->ops->scale = MatScale_SeqAIJKokkos; 1396 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1397 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1398 A->ops->mult = MatMult_SeqAIJKokkos; 1399 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1400 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1401 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1402 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1403 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1404 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1405 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1406 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1407 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1408 A->ops->shift = MatShift_SeqAIJKokkos; 1409 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1410 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1411 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1412 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1413 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1414 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1415 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1416 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1417 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1418 1419 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1420 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1421 PetscFunctionReturn(PETSC_SUCCESS); 1422 } 1423 1424 /* 1425 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1426 1427 Input Parameters: 1428 + A - the MATSEQAIJKOKKOS matrix 1429 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1430 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1431 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1432 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1433 1434 Output Parameter: 1435 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1436 */ 1437 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1438 { 1439 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1440 PetscInt nblocks = bs.extent(0) - 1; 1441 1442 PetscFunctionBegin; 1443 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1444 1445 // Pull out the diagonal blocks of the matrix and then invert the blocks 1446 auto Aa = akok->a_dual.view_device(); 1447 auto Ai = akok->i_dual.view_device(); 1448 auto Aj = akok->j_dual.view_device(); 1449 auto Adiag = akok->diag_dual.view_device(); 1450 // TODO: how to tune the team size? 1451 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1452 auto ts = Kokkos::AUTO(); 1453 #else 1454 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1455 #endif 1456 PetscCallCXX(Kokkos::parallel_for( 1457 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1458 const PetscInt bid = teamMember.league_rank(); // block id 1459 const PetscInt rstart = bs(bid); // this block starts from this row 1460 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1461 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1462 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1463 1464 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1465 PetscInt i = rstart + r; // i-th row in A 1466 1467 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1468 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1469 1470 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1471 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1472 B(r, c) = 0.0; 1473 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1474 B(r, c) = Aa(first + c); 1475 } else { // this entry does not show up in the CSR 1476 B(r, c) = 0.0; 1477 } 1478 } 1479 } else { // rare case that the diagonal does not exist 1480 const PetscInt begin = Ai(i); 1481 const PetscInt end = Ai(i + 1); 1482 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1483 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1484 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1485 else if (Aj(j) >= rstart + m) break; 1486 } 1487 } 1488 }); 1489 1490 // LU-decompose B (w/o pivoting) and then invert B 1491 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1492 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1493 })); 1494 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1495 PetscFunctionReturn(PETSC_SUCCESS); 1496 } 1497 1498 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1499 { 1500 Mat_SeqAIJ *aseq; 1501 PetscInt i, m, n; 1502 auto &exec = PetscGetKokkosExecutionSpace(); 1503 1504 PetscFunctionBegin; 1505 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1506 1507 m = akok->nrows(); 1508 n = akok->ncols(); 1509 PetscCall(MatSetSizes(A, m, n, m, n)); 1510 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1511 1512 /* Set up data structures of A as a MATSEQAIJ */ 1513 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1514 aseq = (Mat_SeqAIJ *)A->data; 1515 1516 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1517 PetscCallCXX(akok->j_dual.sync_host(exec)); 1518 PetscCallCXX(exec.fence()); 1519 1520 aseq->i = akok->i_host_data(); 1521 aseq->j = akok->j_host_data(); 1522 aseq->a = akok->a_host_data(); 1523 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1524 aseq->free_a = PETSC_FALSE; 1525 aseq->free_ij = PETSC_FALSE; 1526 aseq->nz = akok->nnz(); 1527 aseq->maxnz = aseq->nz; 1528 1529 PetscCall(PetscMalloc1(m, &aseq->imax)); 1530 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1531 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1532 1533 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1534 akok->nonzerostate = A->nonzerostate; 1535 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1536 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1537 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1538 PetscFunctionReturn(PETSC_SUCCESS); 1539 } 1540 1541 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1542 { 1543 PetscFunctionBegin; 1544 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1545 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1546 PetscFunctionReturn(PETSC_SUCCESS); 1547 } 1548 1549 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1550 { 1551 Mat_SeqAIJKokkos *akok; 1552 1553 PetscFunctionBegin; 1554 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1555 PetscCall(MatCreate(comm, A)); 1556 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1557 PetscFunctionReturn(PETSC_SUCCESS); 1558 } 1559 1560 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1561 1562 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1563 */ 1564 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1565 { 1566 PetscFunctionBegin; 1567 PetscCall(MatCreate(comm, A)); 1568 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1569 PetscFunctionReturn(PETSC_SUCCESS); 1570 } 1571 1572 /*@C 1573 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1574 (the default parallel PETSc format). This matrix will ultimately be handled by 1575 Kokkos for calculations. 1576 1577 Collective 1578 1579 Input Parameters: 1580 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1581 . m - number of rows 1582 . n - number of columns 1583 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1584 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1585 1586 Output Parameter: 1587 . A - the matrix 1588 1589 Level: intermediate 1590 1591 Notes: 1592 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1593 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1594 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1595 1596 The AIJ format, also called 1597 compressed row storage, is fully compatible with standard Fortran 1598 storage. That is, the stored row and column indices can begin at 1599 either one (as in Fortran) or zero. 1600 1601 Specify the preallocated storage with either `nz` or `nnz` (not both). 1602 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1603 allocation. 1604 1605 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1606 @*/ 1607 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1608 { 1609 PetscFunctionBegin; 1610 PetscCall(PetscKokkosInitializeCheck()); 1611 PetscCall(MatCreate(comm, A)); 1612 PetscCall(MatSetSizes(*A, m, n, m, n)); 1613 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1614 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1615 PetscFunctionReturn(PETSC_SUCCESS); 1616 } 1617 1618 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1619 { 1620 PetscFunctionBegin; 1621 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1622 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1627 { 1628 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1629 1630 PetscFunctionBegin; 1631 if (!factors->sptrsv_symbolic_completed) { 1632 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1633 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1634 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1635 } 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 /* Check if we need to update factors etc for transpose solve */ 1640 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1641 { 1642 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1643 MatColIdxType n = A->rmap->n; 1644 1645 PetscFunctionBegin; 1646 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1647 /* Update L^T and do sptrsv symbolic */ 1648 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1649 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1650 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1651 1652 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1653 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1654 1655 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1656 We have to sort the indices, until KK provides finer control options. 1657 */ 1658 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1659 1660 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1661 1662 /* Update U^T and do sptrsv symbolic */ 1663 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1664 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1665 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1666 1667 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1668 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1669 1670 /* Sort indices. See comments above */ 1671 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1672 1673 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1674 factors->transpose_updated = PETSC_TRUE; 1675 } 1676 PetscFunctionReturn(PETSC_SUCCESS); 1677 } 1678 1679 /* Solve Ax = b, with A = LU */ 1680 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1681 { 1682 ConstPetscScalarKokkosView bv; 1683 PetscScalarKokkosView xv; 1684 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1685 1686 PetscFunctionBegin; 1687 PetscCall(PetscLogGpuTimeBegin()); 1688 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1689 PetscCall(VecGetKokkosView(b, &bv)); 1690 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1691 /* Solve L tmpv = b */ 1692 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1693 /* Solve Ux = tmpv */ 1694 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1695 PetscCall(VecRestoreKokkosView(b, &bv)); 1696 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1697 PetscCall(PetscLogGpuTimeEnd()); 1698 PetscFunctionReturn(PETSC_SUCCESS); 1699 } 1700 1701 /* Solve A^T x = b, where A^T = U^T L^T */ 1702 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1703 { 1704 ConstPetscScalarKokkosView bv; 1705 PetscScalarKokkosView xv; 1706 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1707 1708 PetscFunctionBegin; 1709 PetscCall(PetscLogGpuTimeBegin()); 1710 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1711 PetscCall(VecGetKokkosView(b, &bv)); 1712 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1713 /* Solve U^T tmpv = b */ 1714 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1715 1716 /* Solve L^T x = tmpv */ 1717 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1718 PetscCall(VecRestoreKokkosView(b, &bv)); 1719 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1720 PetscCall(PetscLogGpuTimeEnd()); 1721 PetscFunctionReturn(PETSC_SUCCESS); 1722 } 1723 1724 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1725 { 1726 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1727 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1728 PetscInt fill_lev = info->levels; 1729 1730 PetscFunctionBegin; 1731 PetscCall(PetscLogGpuTimeBegin()); 1732 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1733 1734 auto a_d = aijkok->a_dual.view_device(); 1735 auto i_d = aijkok->i_dual.view_device(); 1736 auto j_d = aijkok->j_dual.view_device(); 1737 1738 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1739 1740 B->assembled = PETSC_TRUE; 1741 B->preallocated = PETSC_TRUE; 1742 B->ops->solve = MatSolve_SeqAIJKokkos; 1743 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1744 B->ops->matsolve = NULL; 1745 B->ops->matsolvetranspose = NULL; 1746 B->offloadmask = PETSC_OFFLOAD_GPU; 1747 1748 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1749 factors->transpose_updated = PETSC_FALSE; 1750 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1751 /* TODO: log flops, but how to know that? */ 1752 PetscCall(PetscLogGpuTimeEnd()); 1753 PetscFunctionReturn(PETSC_SUCCESS); 1754 } 1755 1756 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1757 { 1758 Mat_SeqAIJKokkos *aijkok; 1759 Mat_SeqAIJ *b; 1760 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1761 PetscInt fill_lev = info->levels; 1762 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1763 PetscInt n = A->rmap->n; 1764 1765 PetscFunctionBegin; 1766 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1767 /* Rebuild factors */ 1768 if (factors) { 1769 factors->Destroy(); 1770 } /* Destroy the old if it exists */ 1771 else { 1772 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1773 } 1774 1775 /* Create a spiluk handle and then do symbolic factorization */ 1776 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1777 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1778 1779 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1780 1781 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1782 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1783 Kokkos::realloc(factors->iU_d, n + 1); 1784 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1785 1786 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1787 auto i_d = aijkok->i_dual.view_device(); 1788 auto j_d = aijkok->j_dual.view_device(); 1789 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1790 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1791 1792 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1793 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1794 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1795 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1796 1797 /* TODO: add options to select sptrsv algorithms */ 1798 /* Create sptrsv handles for L, U and their transpose */ 1799 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1800 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1801 #else 1802 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1803 #endif 1804 1805 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1806 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1807 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1808 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1809 1810 /* Fill fields of the factor matrix B */ 1811 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1812 b = (Mat_SeqAIJ *)B->data; 1813 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1814 B->info.fill_ratio_given = info->fill; 1815 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1816 1817 B->offloadmask = PETSC_OFFLOAD_GPU; 1818 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1819 PetscFunctionReturn(PETSC_SUCCESS); 1820 } 1821 1822 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1823 { 1824 PetscFunctionBegin; 1825 *type = MATSOLVERKOKKOS; 1826 PetscFunctionReturn(PETSC_SUCCESS); 1827 } 1828 1829 /*MC 1830 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1831 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1832 1833 Level: beginner 1834 1835 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1836 M*/ 1837 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1838 { 1839 PetscInt n = A->rmap->n; 1840 1841 PetscFunctionBegin; 1842 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1843 PetscCall(MatSetSizes(*B, n, n, n, n)); 1844 (*B)->factortype = ftype; 1845 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1846 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1847 1848 if (ftype == MAT_FACTOR_LU) { 1849 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1850 (*B)->canuseordering = PETSC_TRUE; 1851 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1852 } else if (ftype == MAT_FACTOR_ILU) { 1853 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1854 (*B)->canuseordering = PETSC_FALSE; 1855 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1856 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1857 1858 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1859 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1860 PetscFunctionReturn(PETSC_SUCCESS); 1861 } 1862 1863 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1864 { 1865 PetscFunctionBegin; 1866 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1867 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1868 PetscFunctionReturn(PETSC_SUCCESS); 1869 } 1870 1871 /* Utility to print out a KokkosCsrMatrix for debugging */ 1872 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1873 { 1874 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1875 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1876 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1877 const PetscInt *i = iv.data(); 1878 const PetscInt *j = jv.data(); 1879 const PetscScalar *a = av.data(); 1880 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1881 1882 PetscFunctionBegin; 1883 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1884 for (PetscInt k = 0; k < m; k++) { 1885 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1886 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1887 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1888 } 1889 PetscFunctionReturn(PETSC_SUCCESS); 1890 } 1891