1 #include <petsc_kokkos.hpp> 2 #include <petscvec_kokkos.hpp> 3 #include <petscmat_kokkos.hpp> 4 #include <petscpkg_version.h> 5 #include <petsc/private/petscimpl.h> 6 #include <petsc/private/sfimpl.h> 7 #include <petscsystypes.h> 8 #include <petscerror.h> 9 10 #include <Kokkos_Core.hpp> 11 #include <KokkosBlas.hpp> 12 #include <KokkosSparse_CrsMatrix.hpp> 13 14 // To suppress compiler warnings: 15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 20 #include <KokkosSparse_spmv.hpp> 21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22 23 #include <KokkosSparse_spiluk.hpp> 24 #include <KokkosSparse_sptrsv.hpp> 25 #include <KokkosSparse_spgemm.hpp> 26 #include <KokkosSparse_spadd.hpp> 27 #include <KokkosBatched_LU_Decl.hpp> 28 #include <KokkosBatched_InverseLU_Decl.hpp> 29 30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 31 32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33 #include <KokkosSparse_Utils.hpp> 34 using KokkosSparse::sort_crs_matrix; 35 using KokkosSparse::Impl::transpose_matrix; 36 #else 37 #include <KokkosKernels_Sorting.hpp> 38 using KokkosKernels::sort_crs_matrix; 39 using KokkosKernels::Impl::transpose_matrix; 40 #endif 41 42 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 43 44 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 45 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 46 In the latter case, it is important to set a_dual's sync state correctly. 47 */ 48 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 49 { 50 Mat_SeqAIJ *aijseq; 51 Mat_SeqAIJKokkos *aijkok; 52 53 PetscFunctionBegin; 54 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 55 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 56 57 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 58 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 59 60 /* If aijkok does not exist, we just copy i, j to device. 61 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 62 In both cases, we build a new aijkok structure. 63 */ 64 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 65 delete aijkok; 66 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 67 A->spptr = aijkok; 68 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 69 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 70 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 71 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 /* Sync CSR data to device if not yet */ 77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 78 { 79 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 80 81 PetscFunctionBegin; 82 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 83 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 84 if (aijkok->a_dual.need_sync_device()) { 85 aijkok->a_dual.sync_device(); 86 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 87 aijkok->hermitian_updated = PETSC_FALSE; 88 } 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* Mark the CSR data on device as modified */ 93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 94 { 95 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 96 97 PetscFunctionBegin; 98 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 99 aijkok->a_dual.clear_sync_state(); 100 aijkok->a_dual.modify_device(); 101 aijkok->transpose_updated = PETSC_FALSE; 102 aijkok->hermitian_updated = PETSC_FALSE; 103 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 104 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 auto exec = PetscGetKokkosExecutionSpace(); 112 113 PetscFunctionBegin; 114 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 115 /* We do not expect one needs factors on host */ 116 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 117 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 118 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 119 PetscCallCXX(exec.fence()); 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 124 { 125 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 126 127 PetscFunctionBegin; 128 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 129 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 130 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 131 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 132 */ 133 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 134 auto exec = PetscGetKokkosExecutionSpace(); 135 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 136 PetscCallCXX(exec.fence()); 137 *array = aijkok->a_dual.view_host().data(); 138 } else { /* Happens when calling MatSetValues on a newly created matrix */ 139 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 140 } 141 PetscFunctionReturn(PETSC_SUCCESS); 142 } 143 144 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 145 { 146 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 147 148 PetscFunctionBegin; 149 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 150 PetscFunctionReturn(PETSC_SUCCESS); 151 } 152 153 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 154 { 155 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156 157 PetscFunctionBegin; 158 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 159 auto exec = PetscGetKokkosExecutionSpace(); 160 PetscCallCXX(aijkok->a_dual.sync_host(exec)); 161 PetscCallCXX(exec.fence()); 162 *array = aijkok->a_dual.view_host().data(); 163 } else { 164 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 165 } 166 PetscFunctionReturn(PETSC_SUCCESS); 167 } 168 169 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 170 { 171 PetscFunctionBegin; 172 *array = NULL; 173 PetscFunctionReturn(PETSC_SUCCESS); 174 } 175 176 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 177 { 178 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 179 180 PetscFunctionBegin; 181 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 182 *array = aijkok->a_dual.view_host().data(); 183 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 184 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 185 } 186 PetscFunctionReturn(PETSC_SUCCESS); 187 } 188 189 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 190 { 191 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 192 193 PetscFunctionBegin; 194 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 195 aijkok->a_dual.clear_sync_state(); 196 aijkok->a_dual.modify_host(); 197 } 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 207 208 if (i) *i = aijkok->i_device_data(); 209 if (j) *j = aijkok->j_device_data(); 210 if (a) { 211 aijkok->a_dual.sync_device(); 212 *a = aijkok->a_device_data(); 213 } 214 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 /* 219 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 220 221 Input Parameter: 222 . A - the MATSEQAIJKOKKOS matrix 223 224 Output Parameters: 225 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 226 - T_d - the transpose on device, whose value array is allocated but not initialized 227 */ 228 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 229 { 230 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 231 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 232 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 233 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 234 MatRowMapType *Ti = Ti_h.data(); 235 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 236 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 237 PetscInt *Tj = Tj_h.data(); 238 PetscInt *perm = perm_h.data(); 239 PetscInt *offset; 240 241 PetscFunctionBegin; 242 // Populate Ti 243 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 244 Ti++; 245 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 246 Ti--; 247 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 248 249 // Populate Tj and the permutation array 250 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 251 for (PetscInt i = 0; i < m; i++) { 252 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 253 PetscInt r = Aj[j]; // row r of T 254 PetscInt disp = Ti[r] + offset[r]; 255 256 Tj[disp] = i; // col i of T 257 perm[disp] = j; 258 offset[r]++; 259 } 260 } 261 PetscCall(PetscFree(offset)); 262 263 // Sort each row of T, along with the permutation array 264 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 265 266 // Output perm and T on device 267 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 268 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 269 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 270 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 271 PetscFunctionReturn(PETSC_SUCCESS); 272 } 273 274 // Generate the transpose on device and cache it internally 275 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 276 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 277 { 278 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 279 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 280 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 281 KokkosCsrMatrix &T = akok->csrmatT; 282 283 PetscFunctionBegin; 284 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 285 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 286 287 const auto &Aa = akok->a_dual.view_device(); 288 289 if (A->symmetric == PETSC_BOOL3_TRUE) { 290 *csrmatT = akok->csrmat; 291 } else { 292 // See if we already have a cached transpose and its value is up to date 293 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 294 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 295 const auto &perm = akok->transpose_perm; // get the permutation array 296 auto &Ta = T.values; 297 298 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 299 } 300 } else { // Generate T of size n x m for the first time 301 MatRowMapKokkosView perm; 302 303 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 304 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 305 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 306 } 307 akok->transpose_updated = PETSC_TRUE; 308 *csrmatT = akok->csrmatT; 309 } 310 PetscFunctionReturn(PETSC_SUCCESS); 311 } 312 313 // Generate the Hermitian on device and cache it internally 314 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 315 { 316 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 317 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 318 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 319 KokkosCsrMatrix &T = akok->csrmatH; 320 321 PetscFunctionBegin; 322 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 323 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 324 325 const auto &Aa = akok->a_dual.view_device(); 326 327 if (A->hermitian == PETSC_BOOL3_TRUE) { 328 *csrmatH = akok->csrmat; 329 } else { 330 // See if we already have a cached hermitian and its value is up to date 331 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 332 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 333 const auto &perm = akok->transpose_perm; // get the permutation array 334 auto &Ta = T.values; 335 336 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 337 } 338 } else { // Generate T of size n x m for the first time 339 MatRowMapKokkosView perm; 340 341 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 342 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 343 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 344 } 345 akok->hermitian_updated = PETSC_TRUE; 346 *csrmatH = akok->csrmatH; 347 } 348 PetscFunctionReturn(PETSC_SUCCESS); 349 } 350 351 /* y = A x */ 352 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 353 { 354 Mat_SeqAIJKokkos *aijkok; 355 ConstPetscScalarKokkosView xv; 356 PetscScalarKokkosView yv; 357 358 PetscFunctionBegin; 359 PetscCall(PetscLogGpuTimeBegin()); 360 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 361 PetscCall(VecGetKokkosView(xx, &xv)); 362 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 363 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 364 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 365 PetscCall(VecRestoreKokkosView(xx, &xv)); 366 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 367 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 368 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 369 PetscCall(PetscLogGpuTimeEnd()); 370 PetscFunctionReturn(PETSC_SUCCESS); 371 } 372 373 /* y = A^T x */ 374 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 375 { 376 Mat_SeqAIJKokkos *aijkok; 377 const char *mode; 378 ConstPetscScalarKokkosView xv; 379 PetscScalarKokkosView yv; 380 KokkosCsrMatrix csrmat; 381 382 PetscFunctionBegin; 383 PetscCall(PetscLogGpuTimeBegin()); 384 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 385 PetscCall(VecGetKokkosView(xx, &xv)); 386 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 387 if (A->form_explicit_transpose) { 388 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 389 mode = "N"; 390 } else { 391 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 392 csrmat = aijkok->csrmat; 393 mode = "T"; 394 } 395 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 396 PetscCall(VecRestoreKokkosView(xx, &xv)); 397 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 398 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 399 PetscCall(PetscLogGpuTimeEnd()); 400 PetscFunctionReturn(PETSC_SUCCESS); 401 } 402 403 /* y = A^H x */ 404 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 405 { 406 Mat_SeqAIJKokkos *aijkok; 407 const char *mode; 408 ConstPetscScalarKokkosView xv; 409 PetscScalarKokkosView yv; 410 KokkosCsrMatrix csrmat; 411 412 PetscFunctionBegin; 413 PetscCall(PetscLogGpuTimeBegin()); 414 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 415 PetscCall(VecGetKokkosView(xx, &xv)); 416 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 417 if (A->form_explicit_transpose) { 418 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 419 mode = "N"; 420 } else { 421 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 422 csrmat = aijkok->csrmat; 423 mode = "C"; 424 } 425 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 426 PetscCall(VecRestoreKokkosView(xx, &xv)); 427 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 428 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 429 PetscCall(PetscLogGpuTimeEnd()); 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 433 /* z = A x + y */ 434 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 435 { 436 Mat_SeqAIJKokkos *aijkok; 437 ConstPetscScalarKokkosView xv; 438 PetscScalarKokkosView zv; 439 440 PetscFunctionBegin; 441 PetscCall(PetscLogGpuTimeBegin()); 442 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 443 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 444 PetscCall(VecGetKokkosView(xx, &xv)); 445 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 446 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 447 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 448 PetscCall(VecRestoreKokkosView(xx, &xv)); 449 PetscCall(VecRestoreKokkosView(zz, &zv)); 450 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 451 PetscCall(PetscLogGpuTimeEnd()); 452 PetscFunctionReturn(PETSC_SUCCESS); 453 } 454 455 /* z = A^T x + y */ 456 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 457 { 458 Mat_SeqAIJKokkos *aijkok; 459 const char *mode; 460 ConstPetscScalarKokkosView xv; 461 PetscScalarKokkosView zv; 462 KokkosCsrMatrix csrmat; 463 464 PetscFunctionBegin; 465 PetscCall(PetscLogGpuTimeBegin()); 466 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 467 if (zz != yy) PetscCall(VecCopy(yy, zz)); 468 PetscCall(VecGetKokkosView(xx, &xv)); 469 PetscCall(VecGetKokkosView(zz, &zv)); 470 if (A->form_explicit_transpose) { 471 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 472 mode = "N"; 473 } else { 474 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 475 csrmat = aijkok->csrmat; 476 mode = "T"; 477 } 478 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 479 PetscCall(VecRestoreKokkosView(xx, &xv)); 480 PetscCall(VecRestoreKokkosView(zz, &zv)); 481 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 482 PetscCall(PetscLogGpuTimeEnd()); 483 PetscFunctionReturn(PETSC_SUCCESS); 484 } 485 486 /* z = A^H x + y */ 487 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 488 { 489 Mat_SeqAIJKokkos *aijkok; 490 const char *mode; 491 ConstPetscScalarKokkosView xv; 492 PetscScalarKokkosView zv; 493 KokkosCsrMatrix csrmat; 494 495 PetscFunctionBegin; 496 PetscCall(PetscLogGpuTimeBegin()); 497 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 498 if (zz != yy) PetscCall(VecCopy(yy, zz)); 499 PetscCall(VecGetKokkosView(xx, &xv)); 500 PetscCall(VecGetKokkosView(zz, &zv)); 501 if (A->form_explicit_transpose) { 502 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 503 mode = "N"; 504 } else { 505 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 506 csrmat = aijkok->csrmat; 507 mode = "C"; 508 } 509 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 510 PetscCall(VecRestoreKokkosView(xx, &xv)); 511 PetscCall(VecRestoreKokkosView(zz, &zv)); 512 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 513 PetscCall(PetscLogGpuTimeEnd()); 514 PetscFunctionReturn(PETSC_SUCCESS); 515 } 516 517 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 518 { 519 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 520 521 PetscFunctionBegin; 522 switch (op) { 523 case MAT_FORM_EXPLICIT_TRANSPOSE: 524 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 525 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 526 A->form_explicit_transpose = flg; 527 break; 528 default: 529 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 530 break; 531 } 532 PetscFunctionReturn(PETSC_SUCCESS); 533 } 534 535 /* Depending on reuse, either build a new mat, or use the existing mat */ 536 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 537 { 538 Mat_SeqAIJ *aseq; 539 540 PetscFunctionBegin; 541 PetscCall(PetscKokkosInitializeCheck()); 542 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 543 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 544 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 545 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 546 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 547 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 548 PetscCall(PetscFree(A->defaultvectype)); 549 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 550 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 551 PetscCall(MatSetOps_SeqAIJKokkos(A)); 552 aseq = static_cast<Mat_SeqAIJ *>(A->data); 553 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 554 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 555 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 556 } 557 } 558 PetscFunctionReturn(PETSC_SUCCESS); 559 } 560 561 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 562 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 563 */ 564 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 565 { 566 Mat_SeqAIJ *bseq; 567 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 568 Mat mat; 569 570 PetscFunctionBegin; 571 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 572 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 573 mat = *B; 574 if (A->assembled) { 575 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 576 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 577 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 578 /* Now copy values to B if needed */ 579 if (dupOption == MAT_COPY_VALUES) { 580 if (akok->a_dual.need_sync_device()) { 581 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 582 bkok->a_dual.modify_host(); 583 } else { /* If device has the latest data, we only copy data on device */ 584 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 585 bkok->a_dual.modify_device(); 586 } 587 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 588 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 589 bkok->a_dual.modify_host(); 590 } 591 mat->spptr = bkok; 592 } 593 594 PetscCall(PetscFree(mat->defaultvectype)); 595 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 596 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 597 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 598 PetscFunctionReturn(PETSC_SUCCESS); 599 } 600 601 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 602 { 603 Mat At; 604 KokkosCsrMatrix internT; 605 Mat_SeqAIJKokkos *atkok, *bkok; 606 607 PetscFunctionBegin; 608 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 609 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 610 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 611 /* Deep copy internT, as we want to isolate the internal transpose */ 612 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 613 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 614 if (reuse == MAT_INITIAL_MATRIX) *B = At; 615 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 616 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 617 if ((*B)->assembled) { 618 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 619 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 620 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 621 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 622 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 623 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 624 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 625 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 626 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 627 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 628 } 629 PetscFunctionReturn(PETSC_SUCCESS); 630 } 631 632 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 633 { 634 Mat_SeqAIJKokkos *aijkok; 635 636 PetscFunctionBegin; 637 if (A->factortype == MAT_FACTOR_NONE) { 638 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 639 delete aijkok; 640 } else { 641 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 642 } 643 A->spptr = NULL; 644 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 645 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 646 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 647 #if defined(PETSC_HAVE_HYPRE) 648 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 649 #endif 650 PetscCall(MatDestroy_SeqAIJ(A)); 651 PetscFunctionReturn(PETSC_SUCCESS); 652 } 653 654 /*MC 655 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 656 657 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 658 659 Options Database Key: 660 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 661 662 Level: beginner 663 664 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 665 M*/ 666 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 667 { 668 PetscFunctionBegin; 669 PetscCall(PetscKokkosInitializeCheck()); 670 PetscCall(MatCreate_SeqAIJ(A)); 671 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 672 PetscFunctionReturn(PETSC_SUCCESS); 673 } 674 675 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 676 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 677 { 678 Mat_SeqAIJ *a, *b; 679 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 680 MatScalarKokkosView aa, ba, ca; 681 MatRowMapKokkosView ai, bi, ci; 682 MatColIdxKokkosView aj, bj, cj; 683 PetscInt m, n, nnz, aN; 684 685 PetscFunctionBegin; 686 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 687 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 688 PetscAssertPointer(C, 4); 689 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 690 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 691 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 692 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 693 694 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 695 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 696 a = static_cast<Mat_SeqAIJ *>(A->data); 697 b = static_cast<Mat_SeqAIJ *>(B->data); 698 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 699 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 700 aa = akok->a_dual.view_device(); 701 ai = akok->i_dual.view_device(); 702 ba = bkok->a_dual.view_device(); 703 bi = bkok->i_dual.view_device(); 704 m = A->rmap->n; /* M, N and nnz of C */ 705 n = A->cmap->n + B->cmap->n; 706 nnz = a->nz + b->nz; 707 aN = A->cmap->n; /* N of A */ 708 if (reuse == MAT_INITIAL_MATRIX) { 709 aj = akok->j_dual.view_device(); 710 bj = bkok->j_dual.view_device(); 711 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 712 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 713 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 714 ca = ca_dual.view_device(); 715 ci = ci_dual.view_device(); 716 cj = cj_dual.view_device(); 717 718 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 719 Kokkos::parallel_for( 720 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 721 PetscInt i = t.league_rank(); /* row i */ 722 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 723 724 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 725 ci(i) = coffset; 726 if (i == m - 1) ci(m) = ai(m) + bi(m); 727 }); 728 729 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 730 if (k < alen) { 731 ca(coffset + k) = aa(ai(i) + k); 732 cj(coffset + k) = aj(ai(i) + k); 733 } else { 734 ca(coffset + k) = ba(bi(i) + k - alen); 735 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 736 } 737 }); 738 }); 739 ca_dual.modify_device(); 740 ci_dual.modify_device(); 741 cj_dual.modify_device(); 742 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 743 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 744 } else if (reuse == MAT_REUSE_MATRIX) { 745 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 746 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 747 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 748 ca = ckok->a_dual.view_device(); 749 ci = ckok->i_dual.view_device(); 750 751 Kokkos::parallel_for( 752 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 753 PetscInt i = t.league_rank(); /* row i */ 754 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 755 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 756 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 757 else ca(ci(i) + k) = ba(bi(i) + k - alen); 758 }); 759 }); 760 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 761 } 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 766 { 767 PetscFunctionBegin; 768 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 769 PetscFunctionReturn(PETSC_SUCCESS); 770 } 771 772 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 773 { 774 Mat_Product *product = C->product; 775 Mat A, B; 776 bool transA, transB; /* use bool, since KK needs this type */ 777 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 778 Mat_SeqAIJ *c; 779 MatProductData_SeqAIJKokkos *pdata; 780 KokkosCsrMatrix csrmatA, csrmatB; 781 782 PetscFunctionBegin; 783 MatCheckProduct(C, 1); 784 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 785 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 786 787 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 788 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 789 // we still do numeric. 790 if (pdata->reusesym) { // numeric reuses results from symbolic 791 pdata->reusesym = PETSC_FALSE; 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 switch (product->type) { 796 case MATPRODUCT_AB: 797 transA = false; 798 transB = false; 799 break; 800 case MATPRODUCT_AtB: 801 transA = true; 802 transB = false; 803 break; 804 case MATPRODUCT_ABt: 805 transA = false; 806 transB = true; 807 break; 808 default: 809 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 810 } 811 812 A = product->A; 813 B = product->B; 814 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 815 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 816 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 817 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 818 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 819 820 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 821 822 csrmatA = akok->csrmat; 823 csrmatB = bkok->csrmat; 824 825 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 826 if (transA) { 827 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 828 transA = false; 829 } 830 831 if (transB) { 832 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 833 transB = false; 834 } 835 PetscCall(PetscLogGpuTimeBegin()); 836 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 837 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 838 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 839 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 840 #endif 841 842 PetscCall(PetscLogGpuTimeEnd()); 843 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 844 /* shorter version of MatAssemblyEnd_SeqAIJ */ 845 c = (Mat_SeqAIJ *)C->data; 846 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 847 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 848 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 849 c->reallocs = 0; 850 C->info.mallocs = 0; 851 C->info.nz_unneeded = 0; 852 C->assembled = C->was_assembled = PETSC_TRUE; 853 C->num_ass++; 854 PetscFunctionReturn(PETSC_SUCCESS); 855 } 856 857 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 858 { 859 Mat_Product *product = C->product; 860 MatProductType ptype; 861 Mat A, B; 862 bool transA, transB; 863 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 864 MatProductData_SeqAIJKokkos *pdata; 865 MPI_Comm comm; 866 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 867 868 PetscFunctionBegin; 869 MatCheckProduct(C, 1); 870 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 871 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 872 A = product->A; 873 B = product->B; 874 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 875 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 876 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 877 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 878 csrmatA = akok->csrmat; 879 csrmatB = bkok->csrmat; 880 881 ptype = product->type; 882 // Take advantage of the symmetry if true 883 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 884 ptype = MATPRODUCT_AB; 885 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 886 } 887 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 888 ptype = MATPRODUCT_AB; 889 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 890 } 891 892 switch (ptype) { 893 case MATPRODUCT_AB: 894 transA = false; 895 transB = false; 896 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 897 break; 898 case MATPRODUCT_AtB: 899 transA = true; 900 transB = false; 901 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 902 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 903 break; 904 case MATPRODUCT_ABt: 905 transA = false; 906 transB = true; 907 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 908 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 909 break; 910 default: 911 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 912 } 913 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 914 pdata->reusesym = product->api_user; 915 916 /* TODO: add command line options to select spgemm algorithms */ 917 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 918 919 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 920 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 921 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 922 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 923 #endif 924 #endif 925 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 926 927 PetscCall(PetscLogGpuTimeBegin()); 928 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 929 if (transA) { 930 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 931 transA = false; 932 } 933 934 if (transB) { 935 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 936 transB = false; 937 } 938 939 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 940 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 941 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 942 calling new Mat_SeqAIJKokkos(). 943 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 944 */ 945 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 946 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 947 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 948 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 949 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 950 #endif 951 PetscCall(PetscLogGpuTimeEnd()); 952 953 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 954 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 955 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 956 PetscFunctionReturn(PETSC_SUCCESS); 957 } 958 959 /* handles sparse matrix matrix ops */ 960 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 961 { 962 Mat_Product *product = mat->product; 963 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 964 965 PetscFunctionBegin; 966 MatCheckProduct(mat, 1); 967 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 968 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 969 if (Biskok && Ciskok) { 970 switch (product->type) { 971 case MATPRODUCT_AB: 972 case MATPRODUCT_AtB: 973 case MATPRODUCT_ABt: 974 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 975 break; 976 case MATPRODUCT_PtAP: 977 case MATPRODUCT_RARt: 978 case MATPRODUCT_ABC: 979 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 980 break; 981 default: 982 break; 983 } 984 } else { /* fallback for AIJ */ 985 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 986 } 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 991 { 992 Mat_SeqAIJKokkos *aijkok; 993 994 PetscFunctionBegin; 995 PetscCall(PetscLogGpuTimeBegin()); 996 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 997 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 998 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 999 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1000 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1001 PetscCall(PetscLogGpuTimeEnd()); 1002 PetscFunctionReturn(PETSC_SUCCESS); 1003 } 1004 1005 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1006 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1007 { 1008 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1009 1010 PetscFunctionBegin; 1011 if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1012 PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1013 1014 PetscCall(PetscLogGpuTimeBegin()); 1015 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1016 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1017 const auto &Aa = aijkok->a_dual.view_device(); 1018 const auto &Adiag = aijkok->diag_dual.view_device(); 1019 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1020 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1021 PetscCall(PetscLogGpuFlops(n)); 1022 PetscCall(PetscLogGpuTimeEnd()); 1023 } else { // need reassembly, very slow! 1024 PetscCall(MatShift_Basic(A, a)); 1025 } 1026 PetscFunctionReturn(PETSC_SUCCESS); 1027 } 1028 1029 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1030 { 1031 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1032 1033 PetscFunctionBegin; 1034 if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1035 ConstPetscScalarKokkosView dv; 1036 PetscInt n, nv; 1037 1038 PetscCall(PetscLogGpuTimeBegin()); 1039 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1040 PetscCall(VecGetKokkosView(D, &dv)); 1041 PetscCall(VecGetLocalSize(D, &nv)); 1042 n = PetscMin(Y->rmap->n, Y->cmap->n); 1043 PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1044 1045 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1046 const auto &Aa = aijkok->a_dual.view_device(); 1047 const auto &Adiag = aijkok->diag_dual.view_device(); 1048 PetscCallCXX(Kokkos::parallel_for( 1049 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1050 if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1051 else Aa(Adiag(i)) += dv(i); 1052 })); 1053 PetscCall(VecRestoreKokkosView(D, &dv)); 1054 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1055 PetscCall(PetscLogGpuFlops(n)); 1056 PetscCall(PetscLogGpuTimeEnd()); 1057 } else { // need reassembly, very slow! 1058 PetscCall(MatDiagonalSet_Default(Y, D, is)); 1059 } 1060 PetscFunctionReturn(PETSC_SUCCESS); 1061 } 1062 1063 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1064 { 1065 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1066 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1067 ConstPetscScalarKokkosView lv, rv; 1068 1069 PetscFunctionBegin; 1070 PetscCall(PetscLogGpuTimeBegin()); 1071 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1072 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1073 const auto &Aa = aijkok->a_dual.view_device(); 1074 const auto &Ai = aijkok->i_dual.view_device(); 1075 const auto &Aj = aijkok->j_dual.view_device(); 1076 if (ll) { 1077 PetscCall(VecGetLocalSize(ll, &m)); 1078 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1079 PetscCall(VecGetKokkosView(ll, &lv)); 1080 PetscCallCXX(Kokkos::parallel_for( // for each row 1081 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1082 PetscInt i = t.league_rank(); // row i 1083 PetscInt len = Ai(i + 1) - Ai(i); 1084 // scale entries on the row 1085 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1086 })); 1087 PetscCall(VecRestoreKokkosView(ll, &lv)); 1088 PetscCall(PetscLogGpuFlops(nz)); 1089 } 1090 if (rr) { 1091 PetscCall(VecGetLocalSize(rr, &n)); 1092 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1093 PetscCall(VecGetKokkosView(rr, &rv)); 1094 PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1095 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1096 PetscCall(VecRestoreKokkosView(rr, &lv)); 1097 PetscCall(PetscLogGpuFlops(nz)); 1098 } 1099 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1100 PetscCall(PetscLogGpuTimeEnd()); 1101 PetscFunctionReturn(PETSC_SUCCESS); 1102 } 1103 1104 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1105 { 1106 Mat_SeqAIJKokkos *aijkok; 1107 1108 PetscFunctionBegin; 1109 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1110 if (aijkok) { /* Only zero the device if data is already there */ 1111 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 1112 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1113 } else { /* Might be preallocated but not assembled */ 1114 PetscCall(MatZeroEntries_SeqAIJ(A)); 1115 } 1116 PetscFunctionReturn(PETSC_SUCCESS); 1117 } 1118 1119 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1120 { 1121 Mat_SeqAIJKokkos *aijkok; 1122 PetscInt n; 1123 PetscScalarKokkosView xv; 1124 1125 PetscFunctionBegin; 1126 PetscCall(VecGetLocalSize(x, &n)); 1127 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1128 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1129 1130 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1131 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1132 1133 const auto &Aa = aijkok->a_dual.view_device(); 1134 const auto &Ai = aijkok->i_dual.view_device(); 1135 const auto &Adiag = aijkok->diag_dual.view_device(); 1136 1137 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1138 Kokkos::parallel_for( 1139 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1140 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1141 else xv(i) = 0; 1142 }); 1143 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1144 PetscFunctionReturn(PETSC_SUCCESS); 1145 } 1146 1147 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1148 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1149 { 1150 Mat_SeqAIJKokkos *aijkok; 1151 1152 PetscFunctionBegin; 1153 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1154 PetscAssertPointer(kv, 2); 1155 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1156 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1157 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1158 *kv = aijkok->a_dual.view_device(); 1159 PetscFunctionReturn(PETSC_SUCCESS); 1160 } 1161 1162 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1163 { 1164 PetscFunctionBegin; 1165 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1166 PetscAssertPointer(kv, 2); 1167 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1168 PetscFunctionReturn(PETSC_SUCCESS); 1169 } 1170 1171 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1172 { 1173 Mat_SeqAIJKokkos *aijkok; 1174 1175 PetscFunctionBegin; 1176 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1177 PetscAssertPointer(kv, 2); 1178 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1179 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1180 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1181 *kv = aijkok->a_dual.view_device(); 1182 PetscFunctionReturn(PETSC_SUCCESS); 1183 } 1184 1185 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1186 { 1187 PetscFunctionBegin; 1188 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1189 PetscAssertPointer(kv, 2); 1190 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1191 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1192 PetscFunctionReturn(PETSC_SUCCESS); 1193 } 1194 1195 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1196 { 1197 Mat_SeqAIJKokkos *aijkok; 1198 1199 PetscFunctionBegin; 1200 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1201 PetscAssertPointer(kv, 2); 1202 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1203 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1204 *kv = aijkok->a_dual.view_device(); 1205 PetscFunctionReturn(PETSC_SUCCESS); 1206 } 1207 1208 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1209 { 1210 PetscFunctionBegin; 1211 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1212 PetscAssertPointer(kv, 2); 1213 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1214 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1215 PetscFunctionReturn(PETSC_SUCCESS); 1216 } 1217 1218 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1219 { 1220 Mat_SeqAIJKokkos *akok; 1221 1222 PetscFunctionBegin; 1223 auto exec = PetscGetKokkosExecutionSpace(); 1224 // Create host copies of the input aij 1225 auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1226 auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1227 // Don't copy the vals to the host now 1228 auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1229 1230 MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1231 // Note we have modified device data so it will copy lazily 1232 a_dual.modify_device(); 1233 MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1234 MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1235 1236 PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1237 PetscCall(MatCreate(comm, A)); 1238 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1239 PetscFunctionReturn(PETSC_SUCCESS); 1240 } 1241 1242 /* Computes Y += alpha X */ 1243 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1244 { 1245 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1246 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1247 ConstMatScalarKokkosView Xa; 1248 MatScalarKokkosView Ya; 1249 auto exec = PetscGetKokkosExecutionSpace(); 1250 1251 PetscFunctionBegin; 1252 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1253 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1254 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1255 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1256 PetscCall(PetscLogGpuTimeBegin()); 1257 1258 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1259 PetscBool e; 1260 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1261 if (e) { 1262 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1263 if (e) pattern = SAME_NONZERO_PATTERN; 1264 } 1265 } 1266 1267 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1268 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1269 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1270 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1271 */ 1272 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1273 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1274 Xa = xkok->a_dual.view_device(); 1275 Ya = ykok->a_dual.view_device(); 1276 1277 if (pattern == SAME_NONZERO_PATTERN) { 1278 KokkosBlas::axpy(exec, alpha, Xa, Ya); 1279 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1280 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1281 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1282 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1283 1284 Kokkos::parallel_for( 1285 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1286 PetscInt i = t.league_rank(); // row i 1287 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1288 // Only one thread works in a team 1289 PetscInt p, q = Yi(i); 1290 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1291 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1292 if (Xj(p) == Yj(q)) { // Found it 1293 Ya(q) += alpha * Xa(p); 1294 q++; 1295 } else { 1296 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1297 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1298 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1299 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1300 #else 1301 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1302 #endif 1303 } 1304 } 1305 }); 1306 }); 1307 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1308 } else { // different nonzero patterns 1309 Mat Z; 1310 KokkosCsrMatrix zcsr; 1311 KernelHandle kh; 1312 kh.create_spadd_handle(true); // X, Y are sorted 1313 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1314 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1315 zkok = new Mat_SeqAIJKokkos(zcsr); 1316 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1317 PetscCall(MatHeaderReplace(Y, &Z)); 1318 kh.destroy_spadd_handle(); 1319 } 1320 PetscCall(PetscLogGpuTimeEnd()); 1321 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1322 PetscFunctionReturn(PETSC_SUCCESS); 1323 } 1324 1325 struct MatCOOStruct_SeqAIJKokkos { 1326 PetscCount n; 1327 PetscCount Atot; 1328 PetscInt nz; 1329 PetscCountKokkosView jmap; 1330 PetscCountKokkosView perm; 1331 1332 MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1333 { 1334 nz = coo_h->nz; 1335 n = coo_h->n; 1336 Atot = coo_h->Atot; 1337 jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1338 perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1339 } 1340 }; 1341 1342 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 1343 { 1344 PetscFunctionBegin; 1345 PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 1346 PetscFunctionReturn(PETSC_SUCCESS); 1347 } 1348 1349 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1350 { 1351 Mat_SeqAIJKokkos *akok; 1352 Mat_SeqAIJ *aseq; 1353 PetscContainer container_h; 1354 MatCOOStruct_SeqAIJ *coo_h; 1355 MatCOOStruct_SeqAIJKokkos *coo_d; 1356 1357 PetscFunctionBegin; 1358 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1359 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1360 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1361 delete akok; 1362 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 1363 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1364 1365 // Copy the COO struct to device 1366 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1367 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1368 PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1369 1370 // Put the COO struct in a container and then attach that to the matrix 1371 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 1372 PetscFunctionReturn(PETSC_SUCCESS); 1373 } 1374 1375 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1376 { 1377 MatScalarKokkosView Aa; 1378 ConstMatScalarKokkosView kv; 1379 PetscMemType memtype; 1380 PetscContainer container; 1381 MatCOOStruct_SeqAIJKokkos *coo; 1382 1383 PetscFunctionBegin; 1384 PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1385 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1386 1387 const auto &n = coo->n; 1388 const auto &Annz = coo->nz; 1389 const auto &jmap = coo->jmap; 1390 const auto &perm = coo->perm; 1391 1392 PetscCall(PetscGetMemType(v, &memtype)); 1393 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1394 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 1395 } else { 1396 kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 1397 } 1398 1399 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1400 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1401 1402 PetscCall(PetscLogGpuTimeBegin()); 1403 Kokkos::parallel_for( 1404 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1405 PetscScalar sum = 0.0; 1406 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1407 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1408 }); 1409 PetscCall(PetscLogGpuTimeEnd()); 1410 1411 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1412 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1417 { 1418 PetscFunctionBegin; 1419 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1420 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1421 B->offloadmask = PETSC_OFFLOAD_CPU; 1422 PetscFunctionReturn(PETSC_SUCCESS); 1423 } 1424 1425 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1426 { 1427 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1428 1429 PetscFunctionBegin; 1430 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1431 A->boundtocpu = PETSC_FALSE; 1432 1433 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1434 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1435 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1436 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1437 A->ops->scale = MatScale_SeqAIJKokkos; 1438 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1439 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1440 A->ops->mult = MatMult_SeqAIJKokkos; 1441 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1442 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1443 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1444 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1445 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1446 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1447 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1448 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1449 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1450 A->ops->shift = MatShift_SeqAIJKokkos; 1451 A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1452 A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1453 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1454 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1455 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1456 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1457 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1458 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1459 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1460 1461 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1462 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1463 #if defined(PETSC_HAVE_HYPRE) 1464 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 1465 #endif 1466 PetscFunctionReturn(PETSC_SUCCESS); 1467 } 1468 1469 /* 1470 Extract the (prescribled) diagonal blocks of the matrix and then invert them 1471 1472 Input Parameters: 1473 + A - the MATSEQAIJKOKKOS matrix 1474 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 1475 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 1476 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 1477 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 1478 1479 Output Parameter: 1480 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 1481 */ 1482 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 1483 { 1484 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1485 PetscInt nblocks = bs.extent(0) - 1; 1486 1487 PetscFunctionBegin; 1488 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 1489 1490 // Pull out the diagonal blocks of the matrix and then invert the blocks 1491 auto Aa = akok->a_dual.view_device(); 1492 auto Ai = akok->i_dual.view_device(); 1493 auto Aj = akok->j_dual.view_device(); 1494 auto Adiag = akok->diag_dual.view_device(); 1495 // TODO: how to tune the team size? 1496 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 1497 auto ts = Kokkos::AUTO(); 1498 #else 1499 auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 1500 #endif 1501 PetscCallCXX(Kokkos::parallel_for( 1502 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 1503 const PetscInt bid = teamMember.league_rank(); // block id 1504 const PetscInt rstart = bs(bid); // this block starts from this row 1505 const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 1506 const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 1507 const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 1508 1509 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 1510 PetscInt i = rstart + r; // i-th row in A 1511 1512 if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 1513 PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 1514 1515 for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 1516 if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 1517 B(r, c) = 0.0; 1518 } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 1519 B(r, c) = Aa(first + c); 1520 } else { // this entry does not show up in the CSR 1521 B(r, c) = 0.0; 1522 } 1523 } 1524 } else { // rare case that the diagonal does not exist 1525 const PetscInt begin = Ai(i); 1526 const PetscInt end = Ai(i + 1); 1527 for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 1528 for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 1529 if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 1530 else if (Aj(j) >= rstart + m) break; 1531 } 1532 } 1533 }); 1534 1535 // LU-decompose B (w/o pivoting) and then invert B 1536 KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 1537 KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 1538 })); 1539 // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 1540 PetscFunctionReturn(PETSC_SUCCESS); 1541 } 1542 1543 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1544 { 1545 Mat_SeqAIJ *aseq; 1546 PetscInt i, m, n; 1547 auto exec = PetscGetKokkosExecutionSpace(); 1548 1549 PetscFunctionBegin; 1550 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1551 1552 m = akok->nrows(); 1553 n = akok->ncols(); 1554 PetscCall(MatSetSizes(A, m, n, m, n)); 1555 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1556 1557 /* Set up data structures of A as a MATSEQAIJ */ 1558 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1559 aseq = (Mat_SeqAIJ *)A->data; 1560 1561 PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1562 PetscCallCXX(akok->j_dual.sync_host(exec)); 1563 PetscCallCXX(exec.fence()); 1564 1565 aseq->i = akok->i_host_data(); 1566 aseq->j = akok->j_host_data(); 1567 aseq->a = akok->a_host_data(); 1568 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1569 aseq->free_a = PETSC_FALSE; 1570 aseq->free_ij = PETSC_FALSE; 1571 aseq->nz = akok->nnz(); 1572 aseq->maxnz = aseq->nz; 1573 1574 PetscCall(PetscMalloc1(m, &aseq->imax)); 1575 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1576 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1577 1578 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1579 akok->nonzerostate = A->nonzerostate; 1580 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1581 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1582 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1587 { 1588 PetscFunctionBegin; 1589 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1590 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1591 PetscFunctionReturn(PETSC_SUCCESS); 1592 } 1593 1594 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1595 { 1596 Mat_SeqAIJKokkos *akok; 1597 1598 PetscFunctionBegin; 1599 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1600 PetscCall(MatCreate(comm, A)); 1601 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1602 PetscFunctionReturn(PETSC_SUCCESS); 1603 } 1604 1605 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1606 1607 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1608 */ 1609 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1610 { 1611 PetscFunctionBegin; 1612 PetscCall(MatCreate(comm, A)); 1613 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1614 PetscFunctionReturn(PETSC_SUCCESS); 1615 } 1616 1617 /*@C 1618 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1619 (the default parallel PETSc format). This matrix will ultimately be handled by 1620 Kokkos for calculations. 1621 1622 Collective 1623 1624 Input Parameters: 1625 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1626 . m - number of rows 1627 . n - number of columns 1628 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1629 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1630 1631 Output Parameter: 1632 . A - the matrix 1633 1634 Level: intermediate 1635 1636 Notes: 1637 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1638 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1639 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1640 1641 The AIJ format, also called 1642 compressed row storage, is fully compatible with standard Fortran 1643 storage. That is, the stored row and column indices can begin at 1644 either one (as in Fortran) or zero. 1645 1646 Specify the preallocated storage with either `nz` or `nnz` (not both). 1647 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1648 allocation. 1649 1650 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 1651 @*/ 1652 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1653 { 1654 PetscFunctionBegin; 1655 PetscCall(PetscKokkosInitializeCheck()); 1656 PetscCall(MatCreate(comm, A)); 1657 PetscCall(MatSetSizes(*A, m, n, m, n)); 1658 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1659 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1660 PetscFunctionReturn(PETSC_SUCCESS); 1661 } 1662 1663 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1664 { 1665 PetscFunctionBegin; 1666 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1667 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1668 PetscFunctionReturn(PETSC_SUCCESS); 1669 } 1670 1671 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1672 { 1673 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1674 1675 PetscFunctionBegin; 1676 if (!factors->sptrsv_symbolic_completed) { 1677 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1678 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1679 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1680 } 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 /* Check if we need to update factors etc for transpose solve */ 1685 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1686 { 1687 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1688 MatColIdxType n = A->rmap->n; 1689 1690 PetscFunctionBegin; 1691 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1692 /* Update L^T and do sptrsv symbolic */ 1693 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1694 factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1695 factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1696 1697 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1698 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1699 1700 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1701 We have to sort the indices, until KK provides finer control options. 1702 */ 1703 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1704 1705 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1706 1707 /* Update U^T and do sptrsv symbolic */ 1708 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1709 factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1710 factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1711 1712 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1713 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1714 1715 /* Sort indices. See comments above */ 1716 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1717 1718 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1719 factors->transpose_updated = PETSC_TRUE; 1720 } 1721 PetscFunctionReturn(PETSC_SUCCESS); 1722 } 1723 1724 /* Solve Ax = b, with A = LU */ 1725 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1726 { 1727 ConstPetscScalarKokkosView bv; 1728 PetscScalarKokkosView xv; 1729 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1730 1731 PetscFunctionBegin; 1732 PetscCall(PetscLogGpuTimeBegin()); 1733 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1734 PetscCall(VecGetKokkosView(b, &bv)); 1735 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1736 /* Solve L tmpv = b */ 1737 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1738 /* Solve Ux = tmpv */ 1739 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1740 PetscCall(VecRestoreKokkosView(b, &bv)); 1741 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1742 PetscCall(PetscLogGpuTimeEnd()); 1743 PetscFunctionReturn(PETSC_SUCCESS); 1744 } 1745 1746 /* Solve A^T x = b, where A^T = U^T L^T */ 1747 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1748 { 1749 ConstPetscScalarKokkosView bv; 1750 PetscScalarKokkosView xv; 1751 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1752 1753 PetscFunctionBegin; 1754 PetscCall(PetscLogGpuTimeBegin()); 1755 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1756 PetscCall(VecGetKokkosView(b, &bv)); 1757 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1758 /* Solve U^T tmpv = b */ 1759 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1760 1761 /* Solve L^T x = tmpv */ 1762 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1763 PetscCall(VecRestoreKokkosView(b, &bv)); 1764 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1765 PetscCall(PetscLogGpuTimeEnd()); 1766 PetscFunctionReturn(PETSC_SUCCESS); 1767 } 1768 1769 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1770 { 1771 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1772 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1773 PetscInt fill_lev = info->levels; 1774 1775 PetscFunctionBegin; 1776 PetscCall(PetscLogGpuTimeBegin()); 1777 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1778 1779 auto a_d = aijkok->a_dual.view_device(); 1780 auto i_d = aijkok->i_dual.view_device(); 1781 auto j_d = aijkok->j_dual.view_device(); 1782 1783 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1784 1785 B->assembled = PETSC_TRUE; 1786 B->preallocated = PETSC_TRUE; 1787 B->ops->solve = MatSolve_SeqAIJKokkos; 1788 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1789 B->ops->matsolve = NULL; 1790 B->ops->matsolvetranspose = NULL; 1791 B->offloadmask = PETSC_OFFLOAD_GPU; 1792 1793 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1794 factors->transpose_updated = PETSC_FALSE; 1795 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1796 /* TODO: log flops, but how to know that? */ 1797 PetscCall(PetscLogGpuTimeEnd()); 1798 PetscFunctionReturn(PETSC_SUCCESS); 1799 } 1800 1801 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1802 { 1803 Mat_SeqAIJKokkos *aijkok; 1804 Mat_SeqAIJ *b; 1805 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1806 PetscInt fill_lev = info->levels; 1807 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1808 PetscInt n = A->rmap->n; 1809 1810 PetscFunctionBegin; 1811 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1812 /* Rebuild factors */ 1813 if (factors) { 1814 factors->Destroy(); 1815 } /* Destroy the old if it exists */ 1816 else { 1817 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1818 } 1819 1820 /* Create a spiluk handle and then do symbolic factorization */ 1821 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1822 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1823 1824 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1825 1826 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1827 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1828 Kokkos::realloc(factors->iU_d, n + 1); 1829 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1830 1831 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1832 auto i_d = aijkok->i_dual.view_device(); 1833 auto j_d = aijkok->j_dual.view_device(); 1834 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1835 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1836 1837 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1838 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1839 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1840 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1841 1842 /* TODO: add options to select sptrsv algorithms */ 1843 /* Create sptrsv handles for L, U and their transpose */ 1844 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1845 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1846 #else 1847 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1848 #endif 1849 1850 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1851 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1852 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1853 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1854 1855 /* Fill fields of the factor matrix B */ 1856 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1857 b = (Mat_SeqAIJ *)B->data; 1858 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1859 B->info.fill_ratio_given = info->fill; 1860 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1861 1862 B->offloadmask = PETSC_OFFLOAD_GPU; 1863 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1864 PetscFunctionReturn(PETSC_SUCCESS); 1865 } 1866 1867 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1868 { 1869 PetscFunctionBegin; 1870 *type = MATSOLVERKOKKOS; 1871 PetscFunctionReturn(PETSC_SUCCESS); 1872 } 1873 1874 /*MC 1875 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1876 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1877 1878 Level: beginner 1879 1880 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1881 M*/ 1882 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1883 { 1884 PetscInt n = A->rmap->n; 1885 1886 PetscFunctionBegin; 1887 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1888 PetscCall(MatSetSizes(*B, n, n, n, n)); 1889 (*B)->factortype = ftype; 1890 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1891 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1892 1893 if (ftype == MAT_FACTOR_LU) { 1894 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1895 (*B)->canuseordering = PETSC_TRUE; 1896 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1897 } else if (ftype == MAT_FACTOR_ILU) { 1898 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1899 (*B)->canuseordering = PETSC_FALSE; 1900 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1901 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1902 1903 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1904 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1905 PetscFunctionReturn(PETSC_SUCCESS); 1906 } 1907 1908 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1909 { 1910 PetscFunctionBegin; 1911 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1912 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1913 PetscFunctionReturn(PETSC_SUCCESS); 1914 } 1915 1916 /* Utility to print out a KokkosCsrMatrix for debugging */ 1917 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1918 { 1919 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 1920 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 1921 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 1922 const PetscInt *i = iv.data(); 1923 const PetscInt *j = jv.data(); 1924 const PetscScalar *a = av.data(); 1925 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1926 1927 PetscFunctionBegin; 1928 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1929 for (PetscInt k = 0; k < m; k++) { 1930 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1931 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1932 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1933 } 1934 PetscFunctionReturn(PETSC_SUCCESS); 1935 } 1936