1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscpkg_version.h> 4 #include <petsc/private/petscimpl.h> 5 #include <petsc/private/sfimpl.h> 6 #include <petscsystypes.h> 7 #include <petscerror.h> 8 9 #include <Kokkos_Core.hpp> 10 #include <KokkosBlas.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 #include <KokkosBatched_LU_Decl.hpp> 18 #include <KokkosBatched_InverseLU_Decl.hpp> 19 20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 21 22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23 #include <KokkosSparse_Utils.hpp> 24 using KokkosSparse::sort_crs_matrix; 25 using KokkosSparse::Impl::transpose_matrix; 26 #else 27 #include <KokkosKernels_Sorting.hpp> 28 using KokkosKernels::sort_crs_matrix; 29 using KokkosKernels::Impl::transpose_matrix; 30 #endif 31 32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 33 34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36 In the latter case, it is important to set a_dual's sync state correctly. 37 */ 38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39 { 40 Mat_SeqAIJ *aijseq; 41 Mat_SeqAIJKokkos *aijkok; 42 43 PetscFunctionBegin; 44 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 45 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46 47 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49 50 /* If aijkok does not exist, we just copy i, j to device. 51 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52 In both cases, we build a new aijkok structure. 53 */ 54 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55 delete aijkok; 56 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57 A->spptr = aijkok; 58 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 /* Sync CSR data to device if not yet */ 67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68 { 69 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 70 71 PetscFunctionBegin; 72 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 73 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74 if (aijkok->a_dual.need_sync_device()) { 75 aijkok->a_dual.sync_device(); 76 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 77 aijkok->hermitian_updated = PETSC_FALSE; 78 } 79 PetscFunctionReturn(PETSC_SUCCESS); 80 } 81 82 /* Mark the CSR data on device as modified */ 83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84 { 85 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 86 87 PetscFunctionBegin; 88 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 89 aijkok->a_dual.clear_sync_state(); 90 aijkok->a_dual.modify_device(); 91 aijkok->transpose_updated = PETSC_FALSE; 92 aijkok->hermitian_updated = PETSC_FALSE; 93 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 94 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99 { 100 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101 auto &exec = PetscGetKokkosExecutionSpace(); 102 103 PetscFunctionBegin; 104 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 105 /* We do not expect one needs factors on host */ 106 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 107 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109 PetscCallCXX(exec.fence()); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116 117 PetscFunctionBegin; 118 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 119 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 120 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 121 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 122 */ 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124 auto &exec = PetscGetKokkosExecutionSpace(); 125 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126 PetscCallCXX(exec.fence()); 127 *array = aijkok->a_dual.view_host().data(); 128 } else { /* Happens when calling MatSetValues on a newly created matrix */ 129 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130 } 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135 { 136 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137 138 PetscFunctionBegin; 139 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 140 PetscFunctionReturn(PETSC_SUCCESS); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149 auto &exec = PetscGetKokkosExecutionSpace(); 150 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151 PetscCallCXX(exec.fence()); 152 *array = aijkok->a_dual.view_host().data(); 153 } else { 154 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 155 } 156 PetscFunctionReturn(PETSC_SUCCESS); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160 { 161 PetscFunctionBegin; 162 *array = NULL; 163 PetscFunctionReturn(PETSC_SUCCESS); 164 } 165 166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167 { 168 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169 170 PetscFunctionBegin; 171 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172 *array = aijkok->a_dual.view_host().data(); 173 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 174 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185 aijkok->a_dual.clear_sync_state(); 186 aijkok->a_dual.modify_host(); 187 } 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 194 195 PetscFunctionBegin; 196 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 197 198 if (i) *i = aijkok->i_device_data(); 199 if (j) *j = aijkok->j_device_data(); 200 if (a) { 201 aijkok->a_dual.sync_device(); 202 *a = aijkok->a_device_data(); 203 } 204 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 205 PetscFunctionReturn(PETSC_SUCCESS); 206 } 207 208 /* 209 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 210 211 Input Parameter: 212 . A - the MATSEQAIJKOKKOS matrix 213 214 Output Parameters: 215 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216 - T_d - the transpose on device, whose value array is allocated but not initialized 217 */ 218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219 { 220 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 221 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 222 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 223 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 224 MatRowMapType *Ti = Ti_h.data(); 225 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 226 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 227 PetscInt *Tj = Tj_h.data(); 228 PetscInt *perm = perm_h.data(); 229 PetscInt *offset; 230 231 PetscFunctionBegin; 232 // Populate Ti 233 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 234 Ti++; 235 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 236 Ti--; 237 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 238 239 // Populate Tj and the permutation array 240 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 241 for (PetscInt i = 0; i < m; i++) { 242 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 243 PetscInt r = Aj[j]; // row r of T 244 PetscInt disp = Ti[r] + offset[r]; 245 246 Tj[disp] = i; // col i of T 247 perm[disp] = j; 248 offset[r]++; 249 } 250 } 251 PetscCall(PetscFree(offset)); 252 253 // Sort each row of T, along with the permutation array 254 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 255 256 // Output perm and T on device 257 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 258 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 259 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 260 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 261 PetscFunctionReturn(PETSC_SUCCESS); 262 } 263 264 // Generate the transpose on device and cache it internally 265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267 { 268 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 269 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 270 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 271 KokkosCsrMatrix &T = akok->csrmatT; 272 273 PetscFunctionBegin; 274 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 276 277 const auto &Aa = akok->a_dual.view_device(); 278 279 if (A->symmetric == PETSC_BOOL3_TRUE) { 280 *csrmatT = akok->csrmat; 281 } else { 282 // See if we already have a cached transpose and its value is up to date 283 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 284 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 285 const auto &perm = akok->transpose_perm; // get the permutation array 286 auto &Ta = T.values; 287 288 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289 } 290 } else { // Generate T of size n x m for the first time 291 MatRowMapKokkosView perm; 292 293 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 294 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 296 } 297 akok->transpose_updated = PETSC_TRUE; 298 *csrmatT = akok->csrmatT; 299 } 300 PetscFunctionReturn(PETSC_SUCCESS); 301 } 302 303 // Generate the Hermitian on device and cache it internally 304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 305 { 306 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 307 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 308 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 309 KokkosCsrMatrix &T = akok->csrmatH; 310 311 PetscFunctionBegin; 312 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 313 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 314 315 const auto &Aa = akok->a_dual.view_device(); 316 317 if (A->hermitian == PETSC_BOOL3_TRUE) { 318 *csrmatH = akok->csrmat; 319 } else { 320 // See if we already have a cached hermitian and its value is up to date 321 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 322 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 323 const auto &perm = akok->transpose_perm; // get the permutation array 324 auto &Ta = T.values; 325 326 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 327 } 328 } else { // Generate T of size n x m for the first time 329 MatRowMapKokkosView perm; 330 331 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 332 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 334 } 335 akok->hermitian_updated = PETSC_TRUE; 336 *csrmatH = akok->csrmatH; 337 } 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 /* y = A x */ 342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv; 346 PetscScalarKokkosView yv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx, &xv)); 352 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 353 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 355 PetscCall(VecRestoreKokkosView(xx, &xv)); 356 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 358 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 359 PetscCall(PetscLogGpuTimeEnd()); 360 PetscFunctionReturn(PETSC_SUCCESS); 361 } 362 363 /* y = A^T x */ 364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365 { 366 Mat_SeqAIJKokkos *aijkok; 367 const char *mode; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 KokkosCsrMatrix csrmat; 371 372 PetscFunctionBegin; 373 PetscCall(PetscLogGpuTimeBegin()); 374 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 375 PetscCall(VecGetKokkosView(xx, &xv)); 376 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377 if (A->form_explicit_transpose) { 378 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379 mode = "N"; 380 } else { 381 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 382 csrmat = aijkok->csrmat; 383 mode = "T"; 384 } 385 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 386 PetscCall(VecRestoreKokkosView(xx, &xv)); 387 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 388 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(PETSC_SUCCESS); 391 } 392 393 /* y = A^H x */ 394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv; 399 PetscScalarKokkosView yv; 400 KokkosCsrMatrix csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx, &xv)); 406 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407 if (A->form_explicit_transpose) { 408 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409 mode = "N"; 410 } else { 411 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 412 csrmat = aijkok->csrmat; 413 mode = "C"; 414 } 415 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 416 PetscCall(VecRestoreKokkosView(xx, &xv)); 417 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 418 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 419 PetscCall(PetscLogGpuTimeEnd()); 420 PetscFunctionReturn(PETSC_SUCCESS); 421 } 422 423 /* z = A x + y */ 424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425 { 426 Mat_SeqAIJKokkos *aijkok; 427 ConstPetscScalarKokkosView xv; 428 PetscScalarKokkosView zv; 429 430 PetscFunctionBegin; 431 PetscCall(PetscLogGpuTimeBegin()); 432 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 433 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 434 PetscCall(VecGetKokkosView(xx, &xv)); 435 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 436 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 437 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 438 PetscCall(VecRestoreKokkosView(xx, &xv)); 439 PetscCall(VecRestoreKokkosView(zz, &zv)); 440 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 441 PetscCall(PetscLogGpuTimeEnd()); 442 PetscFunctionReturn(PETSC_SUCCESS); 443 } 444 445 /* z = A^T x + y */ 446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 447 { 448 Mat_SeqAIJKokkos *aijkok; 449 const char *mode; 450 ConstPetscScalarKokkosView xv; 451 PetscScalarKokkosView zv; 452 KokkosCsrMatrix csrmat; 453 454 PetscFunctionBegin; 455 PetscCall(PetscLogGpuTimeBegin()); 456 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 457 if (zz != yy) PetscCall(VecCopy(yy, zz)); 458 PetscCall(VecGetKokkosView(xx, &xv)); 459 PetscCall(VecGetKokkosView(zz, &zv)); 460 if (A->form_explicit_transpose) { 461 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 462 mode = "N"; 463 } else { 464 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 465 csrmat = aijkok->csrmat; 466 mode = "T"; 467 } 468 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 469 PetscCall(VecRestoreKokkosView(xx, &xv)); 470 PetscCall(VecRestoreKokkosView(zz, &zv)); 471 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 472 PetscCall(PetscLogGpuTimeEnd()); 473 PetscFunctionReturn(PETSC_SUCCESS); 474 } 475 476 /* z = A^H x + y */ 477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 478 { 479 Mat_SeqAIJKokkos *aijkok; 480 const char *mode; 481 ConstPetscScalarKokkosView xv; 482 PetscScalarKokkosView zv; 483 KokkosCsrMatrix csrmat; 484 485 PetscFunctionBegin; 486 PetscCall(PetscLogGpuTimeBegin()); 487 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 488 if (zz != yy) PetscCall(VecCopy(yy, zz)); 489 PetscCall(VecGetKokkosView(xx, &xv)); 490 PetscCall(VecGetKokkosView(zz, &zv)); 491 if (A->form_explicit_transpose) { 492 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 493 mode = "N"; 494 } else { 495 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 496 csrmat = aijkok->csrmat; 497 mode = "C"; 498 } 499 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 500 PetscCall(VecRestoreKokkosView(xx, &xv)); 501 PetscCall(VecRestoreKokkosView(zz, &zv)); 502 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 503 PetscCall(PetscLogGpuTimeEnd()); 504 PetscFunctionReturn(PETSC_SUCCESS); 505 } 506 507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508 { 509 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510 511 PetscFunctionBegin; 512 switch (op) { 513 case MAT_FORM_EXPLICIT_TRANSPOSE: 514 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 515 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516 A->form_explicit_transpose = flg; 517 break; 518 default: 519 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520 break; 521 } 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 /* Depending on reuse, either build a new mat, or use the existing mat */ 526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527 { 528 Mat_SeqAIJ *aseq; 529 530 PetscFunctionBegin; 531 PetscCall(PetscKokkosInitializeCheck()); 532 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 533 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 534 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 535 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 537 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 538 PetscCall(PetscFree(A->defaultvectype)); 539 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 540 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 541 PetscCall(MatSetOps_SeqAIJKokkos(A)); 542 aseq = static_cast<Mat_SeqAIJ *>(A->data); 543 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 544 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 546 } 547 } 548 PetscFunctionReturn(PETSC_SUCCESS); 549 } 550 551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553 */ 554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555 { 556 Mat_SeqAIJ *bseq; 557 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558 Mat mat; 559 560 PetscFunctionBegin; 561 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 562 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563 mat = *B; 564 if (A->assembled) { 565 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 567 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568 /* Now copy values to B if needed */ 569 if (dupOption == MAT_COPY_VALUES) { 570 if (akok->a_dual.need_sync_device()) { 571 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572 bkok->a_dual.modify_host(); 573 } else { /* If device has the latest data, we only copy data on device */ 574 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575 bkok->a_dual.modify_device(); 576 } 577 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579 bkok->a_dual.modify_host(); 580 } 581 mat->spptr = bkok; 582 } 583 584 PetscCall(PetscFree(mat->defaultvectype)); 585 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 586 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 587 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 588 PetscFunctionReturn(PETSC_SUCCESS); 589 } 590 591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592 { 593 Mat At; 594 KokkosCsrMatrix internT; 595 Mat_SeqAIJKokkos *atkok, *bkok; 596 597 PetscFunctionBegin; 598 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 599 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 600 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601 /* Deep copy internT, as we want to isolate the internal transpose */ 602 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 603 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 604 if (reuse == MAT_INITIAL_MATRIX) *B = At; 605 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 606 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 607 if ((*B)->assembled) { 608 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 609 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 610 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 611 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 612 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 613 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 614 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 615 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 616 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 617 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 618 } 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623 { 624 Mat_SeqAIJKokkos *aijkok; 625 626 PetscFunctionBegin; 627 if (A->factortype == MAT_FACTOR_NONE) { 628 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 629 delete aijkok; 630 } else { 631 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 632 } 633 A->spptr = NULL; 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 635 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 636 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 637 #if defined(PETSC_HAVE_HYPRE) 638 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 639 #endif 640 PetscCall(MatDestroy_SeqAIJ(A)); 641 PetscFunctionReturn(PETSC_SUCCESS); 642 } 643 644 /*MC 645 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 646 647 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 648 649 Options Database Key: 650 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 651 652 Level: beginner 653 654 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 655 M*/ 656 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 657 { 658 PetscFunctionBegin; 659 PetscCall(PetscKokkosInitializeCheck()); 660 PetscCall(MatCreate_SeqAIJ(A)); 661 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 662 PetscFunctionReturn(PETSC_SUCCESS); 663 } 664 665 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 666 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 667 { 668 Mat_SeqAIJ *a, *b; 669 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 670 MatScalarKokkosView aa, ba, ca; 671 MatRowMapKokkosView ai, bi, ci; 672 MatColIdxKokkosView aj, bj, cj; 673 PetscInt m, n, nnz, aN; 674 675 PetscFunctionBegin; 676 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 677 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 678 PetscAssertPointer(C, 4); 679 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 680 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 681 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 682 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 683 684 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 685 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 686 a = static_cast<Mat_SeqAIJ *>(A->data); 687 b = static_cast<Mat_SeqAIJ *>(B->data); 688 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 689 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 690 aa = akok->a_dual.view_device(); 691 ai = akok->i_dual.view_device(); 692 ba = bkok->a_dual.view_device(); 693 bi = bkok->i_dual.view_device(); 694 m = A->rmap->n; /* M, N and nnz of C */ 695 n = A->cmap->n + B->cmap->n; 696 nnz = a->nz + b->nz; 697 aN = A->cmap->n; /* N of A */ 698 if (reuse == MAT_INITIAL_MATRIX) { 699 aj = akok->j_dual.view_device(); 700 bj = bkok->j_dual.view_device(); 701 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 702 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 703 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 704 ca = ca_dual.view_device(); 705 ci = ci_dual.view_device(); 706 cj = cj_dual.view_device(); 707 708 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 709 Kokkos::parallel_for( 710 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 711 PetscInt i = t.league_rank(); /* row i */ 712 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 713 714 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 715 ci(i) = coffset; 716 if (i == m - 1) ci(m) = ai(m) + bi(m); 717 }); 718 719 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 720 if (k < alen) { 721 ca(coffset + k) = aa(ai(i) + k); 722 cj(coffset + k) = aj(ai(i) + k); 723 } else { 724 ca(coffset + k) = ba(bi(i) + k - alen); 725 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 726 } 727 }); 728 }); 729 ca_dual.modify_device(); 730 ci_dual.modify_device(); 731 cj_dual.modify_device(); 732 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 733 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 734 } else if (reuse == MAT_REUSE_MATRIX) { 735 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 736 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 737 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 738 ca = ckok->a_dual.view_device(); 739 ci = ckok->i_dual.view_device(); 740 741 Kokkos::parallel_for( 742 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 743 PetscInt i = t.league_rank(); /* row i */ 744 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 745 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 747 else ca(ci(i) + k) = ba(bi(i) + k - alen); 748 }); 749 }); 750 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 756 { 757 PetscFunctionBegin; 758 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 759 PetscFunctionReturn(PETSC_SUCCESS); 760 } 761 762 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 763 { 764 Mat_Product *product = C->product; 765 Mat A, B; 766 bool transA, transB; /* use bool, since KK needs this type */ 767 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 768 Mat_SeqAIJ *c; 769 MatProductData_SeqAIJKokkos *pdata; 770 KokkosCsrMatrix csrmatA, csrmatB; 771 772 PetscFunctionBegin; 773 MatCheckProduct(C, 1); 774 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 775 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 776 777 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 778 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 779 // we still do numeric. 780 if (pdata->reusesym) { // numeric reuses results from symbolic 781 pdata->reusesym = PETSC_FALSE; 782 PetscFunctionReturn(PETSC_SUCCESS); 783 } 784 785 switch (product->type) { 786 case MATPRODUCT_AB: 787 transA = false; 788 transB = false; 789 break; 790 case MATPRODUCT_AtB: 791 transA = true; 792 transB = false; 793 break; 794 case MATPRODUCT_ABt: 795 transA = false; 796 transB = true; 797 break; 798 default: 799 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 800 } 801 802 A = product->A; 803 B = product->B; 804 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 805 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 806 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 807 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 808 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 809 810 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 811 812 csrmatA = akok->csrmat; 813 csrmatB = bkok->csrmat; 814 815 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 816 if (transA) { 817 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 818 transA = false; 819 } 820 821 if (transB) { 822 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 823 transB = false; 824 } 825 PetscCall(PetscLogGpuTimeBegin()); 826 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 827 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 828 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 829 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 830 #endif 831 832 PetscCall(PetscLogGpuTimeEnd()); 833 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 834 /* shorter version of MatAssemblyEnd_SeqAIJ */ 835 c = (Mat_SeqAIJ *)C->data; 836 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 837 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 838 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 839 c->reallocs = 0; 840 C->info.mallocs = 0; 841 C->info.nz_unneeded = 0; 842 C->assembled = C->was_assembled = PETSC_TRUE; 843 C->num_ass++; 844 PetscFunctionReturn(PETSC_SUCCESS); 845 } 846 847 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 848 { 849 Mat_Product *product = C->product; 850 MatProductType ptype; 851 Mat A, B; 852 bool transA, transB; 853 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 854 MatProductData_SeqAIJKokkos *pdata; 855 MPI_Comm comm; 856 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 857 858 PetscFunctionBegin; 859 MatCheckProduct(C, 1); 860 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 861 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 862 A = product->A; 863 B = product->B; 864 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 865 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 866 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 867 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 868 csrmatA = akok->csrmat; 869 csrmatB = bkok->csrmat; 870 871 ptype = product->type; 872 // Take advantage of the symmetry if true 873 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 874 ptype = MATPRODUCT_AB; 875 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 876 } 877 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 878 ptype = MATPRODUCT_AB; 879 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 880 } 881 882 switch (ptype) { 883 case MATPRODUCT_AB: 884 transA = false; 885 transB = false; 886 break; 887 case MATPRODUCT_AtB: 888 transA = true; 889 transB = false; 890 break; 891 case MATPRODUCT_ABt: 892 transA = false; 893 transB = true; 894 break; 895 default: 896 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 897 } 898 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 899 pdata->reusesym = product->api_user; 900 901 /* TODO: add command line options to select spgemm algorithms */ 902 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 903 904 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 905 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 906 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 907 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 908 #endif 909 #endif 910 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 911 912 PetscCall(PetscLogGpuTimeBegin()); 913 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 914 if (transA) { 915 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 916 transA = false; 917 } 918 919 if (transB) { 920 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 921 transB = false; 922 } 923 924 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 925 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 926 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 927 calling new Mat_SeqAIJKokkos(). 928 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 929 */ 930 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 931 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 932 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 933 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 934 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 935 #endif 936 PetscCall(PetscLogGpuTimeEnd()); 937 938 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 939 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 940 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 941 PetscFunctionReturn(PETSC_SUCCESS); 942 } 943 944 /* handles sparse matrix matrix ops */ 945 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 946 { 947 Mat_Product *product = mat->product; 948 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 949 950 PetscFunctionBegin; 951 MatCheckProduct(mat, 1); 952 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 953 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 954 if (Biskok && Ciskok) { 955 switch (product->type) { 956 case MATPRODUCT_AB: 957 case MATPRODUCT_AtB: 958 case MATPRODUCT_ABt: 959 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 960 break; 961 case MATPRODUCT_PtAP: 962 case MATPRODUCT_RARt: 963 case MATPRODUCT_ABC: 964 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 965 break; 966 default: 967 break; 968 } 969 } else { /* fallback for AIJ */ 970 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 971 } 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 976 { 977 Mat_SeqAIJKokkos *aijkok; 978 979 PetscFunctionBegin; 980 PetscCall(PetscLogGpuTimeBegin()); 981 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 982 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 983 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 984 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 985 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 986 PetscCall(PetscLogGpuTimeEnd()); 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 991 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 992 { 993 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 994 995 PetscFunctionBegin; 996 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 997 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 998 999 PetscCall(PetscLogGpuTimeBegin()); 1000 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1001 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1002 const auto &Aa = aijkok->a_dual.view_device(); 1003 const auto &Adiag = aijkok->diag_dual.view_device(); 1004 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1005 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1006 PetscCall(PetscLogGpuFlops(n)); 1007 PetscCall(PetscLogGpuTimeEnd()); 1008 } else { // need reassembly, very slow! 1009 PetscCall(MatShift_Basic(A, a)); 1010 } 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1015 { 1016 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1017 1018 PetscFunctionBegin; 1019 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1020 ConstPetscScalarKokkosView dv; 1021 PetscInt n, nv; 1022 1023 PetscCall(PetscLogGpuTimeBegin()); 1024 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1025 PetscCall(VecGetKokkosView(D, &dv)); 1026 PetscCall(VecGetLocalSize(D, &nv)); 1027 n = PetscMin(Y->rmap->n, Y->cmap->n); 1028 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1029 1030 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1031 const auto &Aa = aijkok->a_dual.view_device(); 1032 const auto &Adiag = aijkok->diag_dual.view_device(); 1033 PetscCallCXX(Kokkos::parallel_for( 1034 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1035 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1036 else Aa(Adiag(i)) += dv(i); 1037 })); 1038 PetscCall(VecRestoreKokkosView(D, &dv)); 1039 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1040 PetscCall(PetscLogGpuFlops(n)); 1041 PetscCall(PetscLogGpuTimeEnd()); 1042 } else { // need reassembly, very slow! 1043 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1044 } 1045 PetscFunctionReturn(PETSC_SUCCESS); 1046 } 1047 1048 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1049 { 1050 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1051 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1052 ConstPetscScalarKokkosView lv, rv; 1053 1054 PetscFunctionBegin; 1055 PetscCall(PetscLogGpuTimeBegin()); 1056 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1057 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1058 const auto &Aa = aijkok->a_dual.view_device(); 1059 const auto &Ai = aijkok->i_dual.view_device(); 1060 const auto &Aj = aijkok->j_dual.view_device(); 1061 if (ll) { 1062 PetscCall(VecGetLocalSize(ll, &m)); 1063 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1064 PetscCall(VecGetKokkosView(ll, &lv)); 1065 PetscCallCXX(Kokkos::parallel_for( // for each row 1066 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1067 PetscInt i = t.league_rank(); // row i 1068 PetscInt len = Ai(i + 1) - Ai(i); 1069 // scale entries on the row 1070 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1071 })); 1072 PetscCall(VecRestoreKokkosView(ll, &lv)); 1073 PetscCall(PetscLogGpuFlops(nz)); 1074 } 1075 if (rr) { 1076 PetscCall(VecGetLocalSize(rr, &n)); 1077 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1078 PetscCall(VecGetKokkosView(rr, &rv)); 1079 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1080 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1081 PetscCall(VecRestoreKokkosView(rr, &lv)); 1082 PetscCall(PetscLogGpuFlops(nz)); 1083 } 1084 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1085 PetscCall(PetscLogGpuTimeEnd()); 1086 PetscFunctionReturn(PETSC_SUCCESS); 1087 } 1088 1089 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1090 { 1091 Mat_SeqAIJKokkos *aijkok; 1092 1093 PetscFunctionBegin; 1094 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1095 if (aijkok) { /* Only zero the device if data is already there */ 1096 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1097 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1098 } else { /* Might be preallocated but not assembled */ 1099 PetscCall(MatZeroEntries_SeqAIJ(A)); 1100 } 1101 PetscFunctionReturn(PETSC_SUCCESS); 1102 } 1103 1104 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1105 { 1106 Mat_SeqAIJKokkos *aijkok; 1107 PetscInt n; 1108 PetscScalarKokkosView xv; 1109 1110 PetscFunctionBegin; 1111 PetscCall(VecGetLocalSize(x, &n)); 1112 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1113 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1114 1115 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1116 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1117 1118 const auto &Aa = aijkok->a_dual.view_device(); 1119 const auto &Ai = aijkok->i_dual.view_device(); 1120 const auto &Adiag = aijkok->diag_dual.view_device(); 1121 1122 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1123 Kokkos::parallel_for( 1124 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1125 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1126 else xv(i) = 0; 1127 }); 1128 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1129 PetscFunctionReturn(PETSC_SUCCESS); 1130 } 1131 1132 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1133 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1134 { 1135 Mat_SeqAIJKokkos *aijkok; 1136 1137 PetscFunctionBegin; 1138 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1139 PetscAssertPointer(kv, 2); 1140 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1141 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1142 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1143 *kv = aijkok->a_dual.view_device(); 1144 PetscFunctionReturn(PETSC_SUCCESS); 1145 } 1146 1147 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1148 { 1149 PetscFunctionBegin; 1150 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1151 PetscAssertPointer(kv, 2); 1152 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1153 PetscFunctionReturn(PETSC_SUCCESS); 1154 } 1155 1156 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1157 { 1158 Mat_SeqAIJKokkos *aijkok; 1159 1160 PetscFunctionBegin; 1161 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1162 PetscAssertPointer(kv, 2); 1163 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1164 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1165 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1166 *kv = aijkok->a_dual.view_device(); 1167 PetscFunctionReturn(PETSC_SUCCESS); 1168 } 1169 1170 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1171 { 1172 PetscFunctionBegin; 1173 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1174 PetscAssertPointer(kv, 2); 1175 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1176 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1177 PetscFunctionReturn(PETSC_SUCCESS); 1178 } 1179 1180 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1181 { 1182 Mat_SeqAIJKokkos *aijkok; 1183 1184 PetscFunctionBegin; 1185 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1186 PetscAssertPointer(kv, 2); 1187 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1188 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1189 *kv = aijkok->a_dual.view_device(); 1190 PetscFunctionReturn(PETSC_SUCCESS); 1191 } 1192 1193 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1194 { 1195 PetscFunctionBegin; 1196 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1197 PetscAssertPointer(kv, 2); 1198 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1199 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1200 PetscFunctionReturn(PETSC_SUCCESS); 1201 } 1202 1203 /* Computes Y += alpha X */ 1204 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1205 { 1206 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1207 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1208 ConstMatScalarKokkosView Xa; 1209 MatScalarKokkosView Ya; 1210 auto &exec = PetscGetKokkosExecutionSpace(); 1211 1212 PetscFunctionBegin; 1213 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1214 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1215 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1216 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1217 PetscCall(PetscLogGpuTimeBegin()); 1218 1219 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1220 PetscBool e; 1221 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1222 if (e) { 1223 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1224 if (e) pattern = SAME_NONZERO_PATTERN; 1225 } 1226 } 1227 1228 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1229 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1230 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1231 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1232 */ 1233 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1234 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1235 Xa = xkok->a_dual.view_device(); 1236 Ya = ykok->a_dual.view_device(); 1237 1238 if (pattern == SAME_NONZERO_PATTERN) { 1239 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1240 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1241 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1242 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1243 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1244 1245 Kokkos::parallel_for( 1246 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1247 PetscInt i = t.league_rank(); // row i 1248 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1249 // Only one thread works in a team 1250 PetscInt p, q = Yi(i); 1251 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1252 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1253 if (Xj(p) == Yj(q)) { // Found it 1254 Ya(q) += alpha * Xa(p); 1255 q++; 1256 } else { 1257 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1258 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1259 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1260 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1261 #else 1262 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1263 #endif 1264 } 1265 } 1266 }); 1267 }); 1268 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1269 } else { // different nonzero patterns 1270 Mat Z; 1271 KokkosCsrMatrix zcsr; 1272 KernelHandle kh; 1273 kh.create_spadd_handle(true); // X, Y are sorted 1274 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1275 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1276 zkok = new Mat_SeqAIJKokkos(zcsr); 1277 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1278 PetscCall(MatHeaderReplace(Y, &Z)); 1279 kh.destroy_spadd_handle(); 1280 } 1281 PetscCall(PetscLogGpuTimeEnd()); 1282 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1283 PetscFunctionReturn(PETSC_SUCCESS); 1284 } 1285 1286 struct MatCOOStruct_SeqAIJKokkos { 1287 PetscCount n; 1288 PetscCount Atot; 1289 PetscInt nz; 1290 PetscCountKokkosView jmap; 1291 PetscCountKokkosView perm; 1292 1293 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1294 { 1295 nz = coo_h->nz; 1296 n = coo_h->n; 1297 Atot = coo_h->Atot; 1298 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1299 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1300 } 1301 }; 1302 1303 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1304 { 1305 PetscFunctionBegin; 1306 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1307 PetscFunctionReturn(PETSC_SUCCESS); 1308 } 1309 1310 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1311 { 1312 Mat_SeqAIJKokkos *akok; 1313 Mat_SeqAIJ *aseq; 1314 PetscContainer container_h; 1315 MatCOOStruct_SeqAIJ *coo_h; 1316 MatCOOStruct_SeqAIJKokkos *coo_d; 1317 1318 PetscFunctionBegin; 1319 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1320 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1321 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1322 delete akok; 1323 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1324 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1325 1326 // Copy the COO struct to device 1327 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1328 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1329 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1330 1331 // Put the COO struct in a container and then attach that to the matrix 1332 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1333 PetscFunctionReturn(PETSC_SUCCESS); 1334 } 1335 1336 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1337 { 1338 MatScalarKokkosView Aa; 1339 ConstMatScalarKokkosView kv; 1340 PetscMemType memtype; 1341 PetscContainer container; 1342 MatCOOStruct_SeqAIJKokkos *coo; 1343 1344 PetscFunctionBegin; 1345 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1346 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1347 1348 const auto &n = coo->n; 1349 const auto &Annz = coo->nz; 1350 const auto &jmap = coo->jmap; 1351 const auto &perm = coo->perm; 1352 1353 PetscCall(PetscGetMemType(v, &memtype)); 1354 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1355 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1356 } else { 1357 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1358 } 1359 1360 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1361 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1362 1363 PetscCall(PetscLogGpuTimeBegin()); 1364 Kokkos::parallel_for( 1365 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1366 PetscScalar sum = 0.0; 1367 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1368 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1369 }); 1370 PetscCall(PetscLogGpuTimeEnd()); 1371 1372 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1373 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1374 PetscFunctionReturn(PETSC_SUCCESS); 1375 } 1376 1377 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1378 { 1379 PetscFunctionBegin; 1380 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1381 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1382 B->offloadmask = PETSC_OFFLOAD_CPU; 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1387 { 1388 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1389 1390 PetscFunctionBegin; 1391 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1392 A->boundtocpu = PETSC_FALSE; 1393 1394 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1395 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1396 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1397 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1398 A->ops->scale = MatScale_SeqAIJKokkos; 1399 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1400 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1401 A->ops->mult = MatMult_SeqAIJKokkos; 1402 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1403 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1404 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1405 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1406 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1407 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1408 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1409 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1410 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1411 A->ops->shift = MatShift_SeqAIJKokkos; 1412 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1413 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1414 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1415 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1416 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1417 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1418 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1419 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1420 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1421 1422 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1423 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1424 #if defined(PETSC_HAVE_HYPRE) 1425 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1426 #endif 1427 PetscFunctionReturn(PETSC_SUCCESS); 1428 } 1429 1430 /* 1431 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1432 1433 Input Parameters: 1434 + A - the MATSEQAIJKOKKOS matrix 1435 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1436 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1437 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1438 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1439 1440 Output Parameter: 1441 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1442 */ 1443 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1444 { 1445 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1446 PetscInt nblocks = bs.extent(0) - 1; 1447 1448 PetscFunctionBegin; 1449 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1450 1451 // Pull out the diagonal blocks of the matrix and then invert the blocks 1452 auto Aa = akok->a_dual.view_device(); 1453 auto Ai = akok->i_dual.view_device(); 1454 auto Aj = akok->j_dual.view_device(); 1455 auto Adiag = akok->diag_dual.view_device(); 1456 // TODO: how to tune the team size? 1457 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 1458 auto ts = Kokkos::AUTO(); 1459 #else 1460 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1461 #endif 1462 PetscCallCXX(Kokkos::parallel_for( 1463 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1464 const PetscInt bid = teamMember.league_rank(); // block id 1465 const PetscInt rstart = bs(bid); // this block starts from this row 1466 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1467 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1468 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1469 1470 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1471 PetscInt i = rstart + r; // i-th row in A 1472 1473 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1474 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1475 1476 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1477 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1478 B(r, c) = 0.0; 1479 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1480 B(r, c) = Aa(first + c); 1481 } else { // this entry does not show up in the CSR 1482 B(r, c) = 0.0; 1483 } 1484 } 1485 } else { // rare case that the diagonal does not exist 1486 const PetscInt begin = Ai(i); 1487 const PetscInt end = Ai(i + 1); 1488 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1489 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1490 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1491 else if (Aj(j) >= rstart + m) break; 1492 } 1493 } 1494 }); 1495 1496 // LU-decompose B (w/o pivoting) and then invert B 1497 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1498 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1499 })); 1500 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1501 PetscFunctionReturn(PETSC_SUCCESS); 1502 } 1503 1504 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1505 { 1506 Mat_SeqAIJ *aseq; 1507 PetscInt i, m, n; 1508 auto &exec = PetscGetKokkosExecutionSpace(); 1509 1510 PetscFunctionBegin; 1511 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1512 1513 m = akok->nrows(); 1514 n = akok->ncols(); 1515 PetscCall(MatSetSizes(A, m, n, m, n)); 1516 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1517 1518 /* Set up data structures of A as a MATSEQAIJ */ 1519 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1520 aseq = (Mat_SeqAIJ *)A->data; 1521 1522 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1523 PetscCallCXX(akok->j_dual.sync_host(exec)); 1524 PetscCallCXX(exec.fence()); 1525 1526 aseq->i = akok->i_host_data(); 1527 aseq->j = akok->j_host_data(); 1528 aseq->a = akok->a_host_data(); 1529 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1530 aseq->free_a = PETSC_FALSE; 1531 aseq->free_ij = PETSC_FALSE; 1532 aseq->nz = akok->nnz(); 1533 aseq->maxnz = aseq->nz; 1534 1535 PetscCall(PetscMalloc1(m, &aseq->imax)); 1536 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1537 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1538 1539 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1540 akok->nonzerostate = A->nonzerostate; 1541 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1542 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1543 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1544 PetscFunctionReturn(PETSC_SUCCESS); 1545 } 1546 1547 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1548 { 1549 PetscFunctionBegin; 1550 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1551 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1552 PetscFunctionReturn(PETSC_SUCCESS); 1553 } 1554 1555 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1556 { 1557 Mat_SeqAIJKokkos *akok; 1558 1559 PetscFunctionBegin; 1560 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1561 PetscCall(MatCreate(comm, A)); 1562 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1563 PetscFunctionReturn(PETSC_SUCCESS); 1564 } 1565 1566 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1567 1568 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1569 */ 1570 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1571 { 1572 PetscFunctionBegin; 1573 PetscCall(MatCreate(comm, A)); 1574 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1575 PetscFunctionReturn(PETSC_SUCCESS); 1576 } 1577 1578 /*@C 1579 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1580 (the default parallel PETSc format). This matrix will ultimately be handled by 1581 Kokkos for calculations. 1582 1583 Collective 1584 1585 Input Parameters: 1586 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1587 . m - number of rows 1588 . n - number of columns 1589 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1590 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1591 1592 Output Parameter: 1593 . A - the matrix 1594 1595 Level: intermediate 1596 1597 Notes: 1598 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1599 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1600 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1601 1602 The AIJ format, also called 1603 compressed row storage, is fully compatible with standard Fortran 1604 storage. That is, the stored row and column indices can begin at 1605 either one (as in Fortran) or zero. 1606 1607 Specify the preallocated storage with either `nz` or `nnz` (not both). 1608 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1609 allocation. 1610 1611 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1612 @*/ 1613 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1614 { 1615 PetscFunctionBegin; 1616 PetscCall(PetscKokkosInitializeCheck()); 1617 PetscCall(MatCreate(comm, A)); 1618 PetscCall(MatSetSizes(*A, m, n, m, n)); 1619 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1620 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1621 PetscFunctionReturn(PETSC_SUCCESS); 1622 } 1623 1624 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1625 { 1626 PetscFunctionBegin; 1627 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1628 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1633 { 1634 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1635 1636 PetscFunctionBegin; 1637 if (!factors->sptrsv_symbolic_completed) { 1638 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1639 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1640 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1641 } 1642 PetscFunctionReturn(PETSC_SUCCESS); 1643 } 1644 1645 /* Check if we need to update factors etc for transpose solve */ 1646 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1647 { 1648 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1649 MatColIdxType n = A->rmap->n; 1650 1651 PetscFunctionBegin; 1652 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1653 /* Update L^T and do sptrsv symbolic */ 1654 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1655 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1656 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1657 1658 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1659 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1660 1661 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1662 We have to sort the indices, until KK provides finer control options. 1663 */ 1664 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1665 1666 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1667 1668 /* Update U^T and do sptrsv symbolic */ 1669 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1670 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1671 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1672 1673 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1674 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1675 1676 /* Sort indices. See comments above */ 1677 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1678 1679 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1680 factors->transpose_updated = PETSC_TRUE; 1681 } 1682 PetscFunctionReturn(PETSC_SUCCESS); 1683 } 1684 1685 /* Solve Ax = b, with A = LU */ 1686 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1687 { 1688 ConstPetscScalarKokkosView bv; 1689 PetscScalarKokkosView xv; 1690 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1691 1692 PetscFunctionBegin; 1693 PetscCall(PetscLogGpuTimeBegin()); 1694 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1695 PetscCall(VecGetKokkosView(b, &bv)); 1696 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1697 /* Solve L tmpv = b */ 1698 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1699 /* Solve Ux = tmpv */ 1700 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1701 PetscCall(VecRestoreKokkosView(b, &bv)); 1702 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1703 PetscCall(PetscLogGpuTimeEnd()); 1704 PetscFunctionReturn(PETSC_SUCCESS); 1705 } 1706 1707 /* Solve A^T x = b, where A^T = U^T L^T */ 1708 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1709 { 1710 ConstPetscScalarKokkosView bv; 1711 PetscScalarKokkosView xv; 1712 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1713 1714 PetscFunctionBegin; 1715 PetscCall(PetscLogGpuTimeBegin()); 1716 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1717 PetscCall(VecGetKokkosView(b, &bv)); 1718 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1719 /* Solve U^T tmpv = b */ 1720 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1721 1722 /* Solve L^T x = tmpv */ 1723 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1724 PetscCall(VecRestoreKokkosView(b, &bv)); 1725 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1726 PetscCall(PetscLogGpuTimeEnd()); 1727 PetscFunctionReturn(PETSC_SUCCESS); 1728 } 1729 1730 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1731 { 1732 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1733 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1734 PetscInt fill_lev = info->levels; 1735 1736 PetscFunctionBegin; 1737 PetscCall(PetscLogGpuTimeBegin()); 1738 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1739 1740 auto a_d = aijkok->a_dual.view_device(); 1741 auto i_d = aijkok->i_dual.view_device(); 1742 auto j_d = aijkok->j_dual.view_device(); 1743 1744 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1745 1746 B->assembled = PETSC_TRUE; 1747 B->preallocated = PETSC_TRUE; 1748 B->ops->solve = MatSolve_SeqAIJKokkos; 1749 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1750 B->ops->matsolve = NULL; 1751 B->ops->matsolvetranspose = NULL; 1752 B->offloadmask = PETSC_OFFLOAD_GPU; 1753 1754 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1755 factors->transpose_updated = PETSC_FALSE; 1756 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1757 /* TODO: log flops, but how to know that? */ 1758 PetscCall(PetscLogGpuTimeEnd()); 1759 PetscFunctionReturn(PETSC_SUCCESS); 1760 } 1761 1762 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1763 { 1764 Mat_SeqAIJKokkos *aijkok; 1765 Mat_SeqAIJ *b; 1766 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1767 PetscInt fill_lev = info->levels; 1768 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1769 PetscInt n = A->rmap->n; 1770 1771 PetscFunctionBegin; 1772 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1773 /* Rebuild factors */ 1774 if (factors) { 1775 factors->Destroy(); 1776 } /* Destroy the old if it exists */ 1777 else { 1778 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1779 } 1780 1781 /* Create a spiluk handle and then do symbolic factorization */ 1782 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1783 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1784 1785 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1786 1787 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1788 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1789 Kokkos::realloc(factors->iU_d, n + 1); 1790 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1791 1792 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1793 auto i_d = aijkok->i_dual.view_device(); 1794 auto j_d = aijkok->j_dual.view_device(); 1795 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1796 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1797 1798 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1799 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1800 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1801 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1802 1803 /* TODO: add options to select sptrsv algorithms */ 1804 /* Create sptrsv handles for L, U and their transpose */ 1805 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1806 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1807 #else 1808 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1809 #endif 1810 1811 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1812 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1813 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1814 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1815 1816 /* Fill fields of the factor matrix B */ 1817 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1818 b = (Mat_SeqAIJ *)B->data; 1819 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1820 B->info.fill_ratio_given = info->fill; 1821 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1822 1823 B->offloadmask = PETSC_OFFLOAD_GPU; 1824 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1825 PetscFunctionReturn(PETSC_SUCCESS); 1826 } 1827 1828 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1829 { 1830 PetscFunctionBegin; 1831 *type = MATSOLVERKOKKOS; 1832 PetscFunctionReturn(PETSC_SUCCESS); 1833 } 1834 1835 /*MC 1836 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1837 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1838 1839 Level: beginner 1840 1841 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1842 M*/ 1843 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1844 { 1845 PetscInt n = A->rmap->n; 1846 1847 PetscFunctionBegin; 1848 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1849 PetscCall(MatSetSizes(*B, n, n, n, n)); 1850 (*B)->factortype = ftype; 1851 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1852 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1853 1854 if (ftype == MAT_FACTOR_LU) { 1855 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1856 (*B)->canuseordering = PETSC_TRUE; 1857 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1858 } else if (ftype == MAT_FACTOR_ILU) { 1859 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1860 (*B)->canuseordering = PETSC_FALSE; 1861 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1862 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1863 1864 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1865 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1866 PetscFunctionReturn(PETSC_SUCCESS); 1867 } 1868 1869 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1870 { 1871 PetscFunctionBegin; 1872 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1873 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1874 PetscFunctionReturn(PETSC_SUCCESS); 1875 } 1876 1877 /* Utility to print out a KokkosCsrMatrix for debugging */ 1878 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1879 { 1880 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1881 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1882 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1883 const PetscInt *i = iv.data(); 1884 const PetscInt *j = jv.data(); 1885 const PetscScalar *a = av.data(); 1886 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1887 1888 PetscFunctionBegin; 1889 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1890 for (PetscInt k = 0; k < m; k++) { 1891 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1892 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1893 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1894 } 1895 PetscFunctionReturn(PETSC_SUCCESS); 1896 } 1897