1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 PetscFunctionReturn(PETSC_SUCCESS); 58 } 59 60 /* Sync CSR data to device if not yet */ 61 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 62 { 63 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 64 65 PetscFunctionBegin; 66 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 67 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 68 if (aijkok->a_dual.need_sync_device()) { 69 aijkok->a_dual.sync_device(); 70 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 71 aijkok->hermitian_updated = PETSC_FALSE; 72 } 73 PetscFunctionReturn(PETSC_SUCCESS); 74 } 75 76 /* Mark the CSR data on device as modified */ 77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 78 { 79 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 80 81 PetscFunctionBegin; 82 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 83 aijkok->a_dual.clear_sync_state(); 84 aijkok->a_dual.modify_device(); 85 aijkok->transpose_updated = PETSC_FALSE; 86 aijkok->hermitian_updated = PETSC_FALSE; 87 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 88 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 93 { 94 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95 96 PetscFunctionBegin; 97 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 98 /* We do not expect one needs factors on host */ 99 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 100 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 101 aijkok->a_dual.sync_host(); 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 106 { 107 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 108 109 PetscFunctionBegin; 110 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 111 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 112 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 113 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 114 */ 115 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 116 aijkok->a_dual.sync_host(); 117 *array = aijkok->a_dual.view_host().data(); 118 } else { /* Happens when calling MatSetValues on a newly created matrix */ 119 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 120 } 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 125 { 126 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127 128 PetscFunctionBegin; 129 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 130 PetscFunctionReturn(PETSC_SUCCESS); 131 } 132 133 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 134 { 135 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 136 137 PetscFunctionBegin; 138 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 139 aijkok->a_dual.sync_host(); 140 *array = aijkok->a_dual.view_host().data(); 141 } else { 142 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 143 } 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 148 { 149 PetscFunctionBegin; 150 *array = NULL; 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155 { 156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157 158 PetscFunctionBegin; 159 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 160 *array = aijkok->a_dual.view_host().data(); 161 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 162 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 163 } 164 PetscFunctionReturn(PETSC_SUCCESS); 165 } 166 167 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 168 { 169 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 170 171 PetscFunctionBegin; 172 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 173 aijkok->a_dual.clear_sync_state(); 174 aijkok->a_dual.modify_host(); 175 } 176 PetscFunctionReturn(PETSC_SUCCESS); 177 } 178 179 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 180 { 181 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182 183 PetscFunctionBegin; 184 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 185 186 if (i) *i = aijkok->i_device_data(); 187 if (j) *j = aijkok->j_device_data(); 188 if (a) { 189 aijkok->a_dual.sync_device(); 190 *a = aijkok->a_device_data(); 191 } 192 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 /* 197 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 198 199 Input Parameter: 200 . A - the MATSEQAIJKOKKOS matrix 201 202 Output Parameters: 203 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 204 - T_d - the transpose on device, whose value array is allocated but not initialized 205 */ 206 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 207 { 208 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 209 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 210 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 211 MatRowMapKokkosViewHost Ti_h("Ti", n + 1); 212 MatRowMapType *Ti = Ti_h.data(); 213 MatColIdxKokkosViewHost Tj_h("Tj", nz); 214 MatRowMapKokkosViewHost perm_h("permutation", nz); 215 PetscInt *Tj = Tj_h.data(); 216 PetscInt *perm = perm_h.data(); 217 PetscInt *offset; 218 219 PetscFunctionBegin; 220 // Populate Ti 221 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 222 Ti++; 223 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 224 Ti--; 225 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 226 227 // Populate Tj and the permutation array 228 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 229 for (PetscInt i = 0; i < m; i++) { 230 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 231 PetscInt r = Aj[j]; // row r of T 232 PetscInt disp = Ti[r] + offset[r]; 233 234 Tj[disp] = i; // col i of T 235 perm[disp] = j; 236 offset[r]++; 237 } 238 } 239 PetscCall(PetscFree(offset)); 240 241 // Sort each row of T, along with the permutation array 242 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 243 244 // Output perm and T on device 245 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 246 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 247 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 248 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 // Generate the transpose on device and cache it internally 253 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 254 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 255 { 256 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 257 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 258 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 259 KokkosCsrMatrix &T = akok->csrmatT; 260 261 PetscFunctionBegin; 262 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 263 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 264 265 const auto &Aa = akok->a_dual.view_device(); 266 267 if (A->symmetric == PETSC_BOOL3_TRUE) { 268 *csrmatT = akok->csrmat; 269 } else { 270 // See if we already have a cached transpose and its value is up to date 271 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 272 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 273 const auto &perm = akok->transpose_perm; // get the permutation array 274 auto &Ta = T.values; 275 276 PetscCallCXX(Kokkos::parallel_for( 277 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 278 } 279 } else { // Generate T of size n x m for the first time 280 MatRowMapKokkosView perm; 281 282 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 283 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 284 PetscCallCXX(Kokkos::parallel_for( 285 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 286 } 287 akok->transpose_updated = PETSC_TRUE; 288 *csrmatT = akok->csrmatT; 289 } 290 PetscFunctionReturn(PETSC_SUCCESS); 291 } 292 293 // Generate the Hermitian on device and cache it internally 294 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 295 { 296 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 297 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 298 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 299 KokkosCsrMatrix &T = akok->csrmatH; 300 301 PetscFunctionBegin; 302 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 303 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 304 305 const auto &Aa = akok->a_dual.view_device(); 306 307 if (A->hermitian == PETSC_BOOL3_TRUE) { 308 *csrmatH = akok->csrmat; 309 } else { 310 // See if we already have a cached hermitian and its value is up to date 311 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 312 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 313 const auto &perm = akok->transpose_perm; // get the permutation array 314 auto &Ta = T.values; 315 316 PetscCallCXX(Kokkos::parallel_for( 317 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 318 } 319 } else { // Generate T of size n x m for the first time 320 MatRowMapKokkosView perm; 321 322 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 323 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 324 PetscCallCXX(Kokkos::parallel_for( 325 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 326 } 327 akok->hermitian_updated = PETSC_TRUE; 328 *csrmatH = akok->csrmatH; 329 } 330 PetscFunctionReturn(PETSC_SUCCESS); 331 } 332 333 /* y = A x */ 334 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 335 { 336 Mat_SeqAIJKokkos *aijkok; 337 ConstPetscScalarKokkosView xv; 338 PetscScalarKokkosView yv; 339 340 PetscFunctionBegin; 341 PetscCall(PetscLogGpuTimeBegin()); 342 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 343 PetscCall(VecGetKokkosView(xx, &xv)); 344 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 345 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 346 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 347 PetscCall(VecRestoreKokkosView(xx, &xv)); 348 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 349 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 350 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 351 PetscCall(PetscLogGpuTimeEnd()); 352 PetscFunctionReturn(PETSC_SUCCESS); 353 } 354 355 /* y = A^T x */ 356 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJKokkos *aijkok; 359 const char *mode; 360 ConstPetscScalarKokkosView xv; 361 PetscScalarKokkosView yv; 362 KokkosCsrMatrix csrmat; 363 364 PetscFunctionBegin; 365 PetscCall(PetscLogGpuTimeBegin()); 366 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 367 PetscCall(VecGetKokkosView(xx, &xv)); 368 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 369 if (A->form_explicit_transpose) { 370 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 371 mode = "N"; 372 } else { 373 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 374 csrmat = aijkok->csrmat; 375 mode = "T"; 376 } 377 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 381 PetscCall(PetscLogGpuTimeEnd()); 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 /* y = A^H x */ 386 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387 { 388 Mat_SeqAIJKokkos *aijkok; 389 const char *mode; 390 ConstPetscScalarKokkosView xv; 391 PetscScalarKokkosView yv; 392 KokkosCsrMatrix csrmat; 393 394 PetscFunctionBegin; 395 PetscCall(PetscLogGpuTimeBegin()); 396 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 397 PetscCall(VecGetKokkosView(xx, &xv)); 398 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 399 if (A->form_explicit_transpose) { 400 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 401 mode = "N"; 402 } else { 403 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 404 csrmat = aijkok->csrmat; 405 mode = "C"; 406 } 407 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 408 PetscCall(VecRestoreKokkosView(xx, &xv)); 409 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 410 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 411 PetscCall(PetscLogGpuTimeEnd()); 412 PetscFunctionReturn(PETSC_SUCCESS); 413 } 414 415 /* z = A x + y */ 416 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 417 { 418 Mat_SeqAIJKokkos *aijkok; 419 ConstPetscScalarKokkosView xv, yv; 420 PetscScalarKokkosView zv; 421 422 PetscFunctionBegin; 423 PetscCall(PetscLogGpuTimeBegin()); 424 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 425 PetscCall(VecGetKokkosView(xx, &xv)); 426 PetscCall(VecGetKokkosView(yy, &yv)); 427 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 428 if (zz != yy) Kokkos::deep_copy(zv, yv); 429 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 430 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 431 PetscCall(VecRestoreKokkosView(xx, &xv)); 432 PetscCall(VecRestoreKokkosView(yy, &yv)); 433 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 434 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 435 PetscCall(PetscLogGpuTimeEnd()); 436 PetscFunctionReturn(PETSC_SUCCESS); 437 } 438 439 /* z = A^T x + y */ 440 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 441 { 442 Mat_SeqAIJKokkos *aijkok; 443 const char *mode; 444 ConstPetscScalarKokkosView xv, yv; 445 PetscScalarKokkosView zv; 446 KokkosCsrMatrix csrmat; 447 448 PetscFunctionBegin; 449 PetscCall(PetscLogGpuTimeBegin()); 450 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 451 PetscCall(VecGetKokkosView(xx, &xv)); 452 PetscCall(VecGetKokkosView(yy, &yv)); 453 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 454 if (zz != yy) Kokkos::deep_copy(zv, yv); 455 if (A->form_explicit_transpose) { 456 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 457 mode = "N"; 458 } else { 459 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 460 csrmat = aijkok->csrmat; 461 mode = "T"; 462 } 463 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 464 PetscCall(VecRestoreKokkosView(xx, &xv)); 465 PetscCall(VecRestoreKokkosView(yy, &yv)); 466 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 467 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 468 PetscCall(PetscLogGpuTimeEnd()); 469 PetscFunctionReturn(PETSC_SUCCESS); 470 } 471 472 /* z = A^H x + y */ 473 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 474 { 475 Mat_SeqAIJKokkos *aijkok; 476 const char *mode; 477 ConstPetscScalarKokkosView xv, yv; 478 PetscScalarKokkosView zv; 479 KokkosCsrMatrix csrmat; 480 481 PetscFunctionBegin; 482 PetscCall(PetscLogGpuTimeBegin()); 483 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 484 PetscCall(VecGetKokkosView(xx, &xv)); 485 PetscCall(VecGetKokkosView(yy, &yv)); 486 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 487 if (zz != yy) Kokkos::deep_copy(zv, yv); 488 if (A->form_explicit_transpose) { 489 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 490 mode = "N"; 491 } else { 492 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 493 csrmat = aijkok->csrmat; 494 mode = "C"; 495 } 496 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 497 PetscCall(VecRestoreKokkosView(xx, &xv)); 498 PetscCall(VecRestoreKokkosView(yy, &yv)); 499 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 500 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 501 PetscCall(PetscLogGpuTimeEnd()); 502 PetscFunctionReturn(PETSC_SUCCESS); 503 } 504 505 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 506 { 507 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508 509 PetscFunctionBegin; 510 switch (op) { 511 case MAT_FORM_EXPLICIT_TRANSPOSE: 512 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 513 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 514 A->form_explicit_transpose = flg; 515 break; 516 default: 517 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 518 break; 519 } 520 PetscFunctionReturn(PETSC_SUCCESS); 521 } 522 523 /* Depending on reuse, either build a new mat, or use the existing mat */ 524 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 525 { 526 Mat_SeqAIJ *aseq; 527 528 PetscFunctionBegin; 529 PetscCall(PetscKokkosInitializeCheck()); 530 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 531 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 532 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 533 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 534 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 535 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 536 PetscCall(PetscFree(A->defaultvectype)); 537 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 538 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 539 PetscCall(MatSetOps_SeqAIJKokkos(A)); 540 aseq = static_cast<Mat_SeqAIJ *>(A->data); 541 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 542 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 543 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 544 } 545 } 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 549 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 550 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 551 */ 552 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 553 { 554 Mat_SeqAIJ *bseq; 555 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 556 Mat mat; 557 558 PetscFunctionBegin; 559 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 560 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 561 mat = *B; 562 if (A->assembled) { 563 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 564 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 565 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 566 /* Now copy values to B if needed */ 567 if (dupOption == MAT_COPY_VALUES) { 568 if (akok->a_dual.need_sync_device()) { 569 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 570 bkok->a_dual.modify_host(); 571 } else { /* If device has the latest data, we only copy data on device */ 572 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 573 bkok->a_dual.modify_device(); 574 } 575 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 576 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 577 bkok->a_dual.modify_host(); 578 } 579 mat->spptr = bkok; 580 } 581 582 PetscCall(PetscFree(mat->defaultvectype)); 583 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 584 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 585 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 586 PetscFunctionReturn(PETSC_SUCCESS); 587 } 588 589 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 590 { 591 Mat At; 592 KokkosCsrMatrix internT; 593 Mat_SeqAIJKokkos *atkok, *bkok; 594 595 PetscFunctionBegin; 596 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 597 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 598 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 599 /* Deep copy internT, as we want to isolate the internal transpose */ 600 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 601 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 602 if (reuse == MAT_INITIAL_MATRIX) *B = At; 603 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 604 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 605 if ((*B)->assembled) { 606 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 607 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 608 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 609 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 610 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 611 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 612 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 613 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 614 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 615 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 616 } 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 621 { 622 Mat_SeqAIJKokkos *aijkok; 623 624 PetscFunctionBegin; 625 if (A->factortype == MAT_FACTOR_NONE) { 626 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 627 delete aijkok; 628 } else { 629 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 630 } 631 A->spptr = NULL; 632 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 633 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 634 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 635 PetscCall(MatDestroy_SeqAIJ(A)); 636 PetscFunctionReturn(PETSC_SUCCESS); 637 } 638 639 /*MC 640 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 641 642 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 643 644 Options Database Key: 645 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 646 647 Level: beginner 648 649 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 650 M*/ 651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 652 { 653 PetscFunctionBegin; 654 PetscCall(PetscKokkosInitializeCheck()); 655 PetscCall(MatCreate_SeqAIJ(A)); 656 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 657 PetscFunctionReturn(PETSC_SUCCESS); 658 } 659 660 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 661 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 662 { 663 Mat_SeqAIJ *a, *b; 664 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 665 MatScalarKokkosView aa, ba, ca; 666 MatRowMapKokkosView ai, bi, ci; 667 MatColIdxKokkosView aj, bj, cj; 668 PetscInt m, n, nnz, aN; 669 670 PetscFunctionBegin; 671 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 672 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 673 PetscValidPointer(C, 4); 674 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 675 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 676 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 677 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 678 679 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 680 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 681 a = static_cast<Mat_SeqAIJ *>(A->data); 682 b = static_cast<Mat_SeqAIJ *>(B->data); 683 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 684 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 685 aa = akok->a_dual.view_device(); 686 ai = akok->i_dual.view_device(); 687 ba = bkok->a_dual.view_device(); 688 bi = bkok->i_dual.view_device(); 689 m = A->rmap->n; /* M, N and nnz of C */ 690 n = A->cmap->n + B->cmap->n; 691 nnz = a->nz + b->nz; 692 aN = A->cmap->n; /* N of A */ 693 if (reuse == MAT_INITIAL_MATRIX) { 694 aj = akok->j_dual.view_device(); 695 bj = bkok->j_dual.view_device(); 696 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 697 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 698 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 699 ca = ca_dual.view_device(); 700 ci = ci_dual.view_device(); 701 cj = cj_dual.view_device(); 702 703 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 704 Kokkos::parallel_for( 705 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 706 PetscInt i = t.league_rank(); /* row i */ 707 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 708 709 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 710 ci(i) = coffset; 711 if (i == m - 1) ci(m) = ai(m) + bi(m); 712 }); 713 714 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 715 if (k < alen) { 716 ca(coffset + k) = aa(ai(i) + k); 717 cj(coffset + k) = aj(ai(i) + k); 718 } else { 719 ca(coffset + k) = ba(bi(i) + k - alen); 720 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 721 } 722 }); 723 }); 724 ca_dual.modify_device(); 725 ci_dual.modify_device(); 726 cj_dual.modify_device(); 727 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 728 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 729 } else if (reuse == MAT_REUSE_MATRIX) { 730 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 731 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 732 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 733 ca = ckok->a_dual.view_device(); 734 ci = ckok->i_dual.view_device(); 735 736 Kokkos::parallel_for( 737 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 738 PetscInt i = t.league_rank(); /* row i */ 739 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 740 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 742 else ca(ci(i) + k) = ba(bi(i) + k - alen); 743 }); 744 }); 745 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 746 } 747 PetscFunctionReturn(PETSC_SUCCESS); 748 } 749 750 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 751 { 752 PetscFunctionBegin; 753 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 754 PetscFunctionReturn(PETSC_SUCCESS); 755 } 756 757 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 758 { 759 Mat_Product *product = C->product; 760 Mat A, B; 761 bool transA, transB; /* use bool, since KK needs this type */ 762 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 763 Mat_SeqAIJ *c; 764 MatProductData_SeqAIJKokkos *pdata; 765 KokkosCsrMatrix csrmatA, csrmatB; 766 767 PetscFunctionBegin; 768 MatCheckProduct(C, 1); 769 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 770 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 771 772 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 773 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 774 // we still do numeric. 775 if (pdata->reusesym) { // numeric reuses results from symbolic 776 pdata->reusesym = PETSC_FALSE; 777 PetscFunctionReturn(PETSC_SUCCESS); 778 } 779 780 switch (product->type) { 781 case MATPRODUCT_AB: 782 transA = false; 783 transB = false; 784 break; 785 case MATPRODUCT_AtB: 786 transA = true; 787 transB = false; 788 break; 789 case MATPRODUCT_ABt: 790 transA = false; 791 transB = true; 792 break; 793 default: 794 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 795 } 796 797 A = product->A; 798 B = product->B; 799 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 800 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 801 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 802 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 803 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 804 805 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 806 807 csrmatA = akok->csrmat; 808 csrmatB = bkok->csrmat; 809 810 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 811 if (transA) { 812 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 813 transA = false; 814 } 815 816 if (transB) { 817 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 818 transB = false; 819 } 820 PetscCall(PetscLogGpuTimeBegin()); 821 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 822 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 823 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 824 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 825 #endif 826 827 PetscCall(PetscLogGpuTimeEnd()); 828 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 829 /* shorter version of MatAssemblyEnd_SeqAIJ */ 830 c = (Mat_SeqAIJ *)C->data; 831 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 832 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 833 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 834 c->reallocs = 0; 835 C->info.mallocs = 0; 836 C->info.nz_unneeded = 0; 837 C->assembled = C->was_assembled = PETSC_TRUE; 838 C->num_ass++; 839 PetscFunctionReturn(PETSC_SUCCESS); 840 } 841 842 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 843 { 844 Mat_Product *product = C->product; 845 MatProductType ptype; 846 Mat A, B; 847 bool transA, transB; 848 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 849 MatProductData_SeqAIJKokkos *pdata; 850 MPI_Comm comm; 851 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 852 853 PetscFunctionBegin; 854 MatCheckProduct(C, 1); 855 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 856 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 857 A = product->A; 858 B = product->B; 859 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 860 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 861 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 863 csrmatA = akok->csrmat; 864 csrmatB = bkok->csrmat; 865 866 ptype = product->type; 867 // Take advantage of the symmetry if true 868 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 869 ptype = MATPRODUCT_AB; 870 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 871 } 872 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 873 ptype = MATPRODUCT_AB; 874 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 875 } 876 877 switch (ptype) { 878 case MATPRODUCT_AB: 879 transA = false; 880 transB = false; 881 break; 882 case MATPRODUCT_AtB: 883 transA = true; 884 transB = false; 885 break; 886 case MATPRODUCT_ABt: 887 transA = false; 888 transB = true; 889 break; 890 default: 891 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 892 } 893 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 894 pdata->reusesym = product->api_user; 895 896 /* TODO: add command line options to select spgemm algorithms */ 897 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 898 899 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 900 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 901 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 902 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 903 #endif 904 #endif 905 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 906 907 PetscCall(PetscLogGpuTimeBegin()); 908 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 909 if (transA) { 910 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 911 transA = false; 912 } 913 914 if (transB) { 915 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 916 transB = false; 917 } 918 919 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 920 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 921 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 922 calling new Mat_SeqAIJKokkos(). 923 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 924 */ 925 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 926 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 927 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 928 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 929 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 930 #endif 931 PetscCall(PetscLogGpuTimeEnd()); 932 933 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 934 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 935 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 936 PetscFunctionReturn(PETSC_SUCCESS); 937 } 938 939 /* handles sparse matrix matrix ops */ 940 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 941 { 942 Mat_Product *product = mat->product; 943 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 944 945 PetscFunctionBegin; 946 MatCheckProduct(mat, 1); 947 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 948 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 949 if (Biskok && Ciskok) { 950 switch (product->type) { 951 case MATPRODUCT_AB: 952 case MATPRODUCT_AtB: 953 case MATPRODUCT_ABt: 954 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 955 break; 956 case MATPRODUCT_PtAP: 957 case MATPRODUCT_RARt: 958 case MATPRODUCT_ABC: 959 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 960 break; 961 default: 962 break; 963 } 964 } else { /* fallback for AIJ */ 965 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 966 } 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 971 { 972 Mat_SeqAIJKokkos *aijkok; 973 974 PetscFunctionBegin; 975 PetscCall(PetscLogGpuTimeBegin()); 976 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 977 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 978 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 979 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 980 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 981 PetscCall(PetscLogGpuTimeEnd()); 982 PetscFunctionReturn(PETSC_SUCCESS); 983 } 984 985 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 986 { 987 Mat_SeqAIJKokkos *aijkok; 988 989 PetscFunctionBegin; 990 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 991 if (aijkok) { /* Only zero the device if data is already there */ 992 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 993 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 994 } else { /* Might be preallocated but not assembled */ 995 PetscCall(MatZeroEntries_SeqAIJ(A)); 996 } 997 PetscFunctionReturn(PETSC_SUCCESS); 998 } 999 1000 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1001 { 1002 Mat_SeqAIJ *aijseq; 1003 Mat_SeqAIJKokkos *aijkok; 1004 PetscInt n; 1005 PetscScalarKokkosView xv; 1006 1007 PetscFunctionBegin; 1008 PetscCall(VecGetLocalSize(x, &n)); 1009 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1010 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1011 1012 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014 1015 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1016 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1017 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1018 aijkok->SetDiagonal(aijseq->diag); 1019 } 1020 1021 const auto &Aa = aijkok->a_dual.view_device(); 1022 const auto &Ai = aijkok->i_dual.view_device(); 1023 const auto &Adiag = aijkok->diag_dual.view_device(); 1024 1025 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1026 Kokkos::parallel_for( 1027 n, KOKKOS_LAMBDA(const PetscInt i) { 1028 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1029 else xv(i) = 0; 1030 }); 1031 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1032 PetscFunctionReturn(PETSC_SUCCESS); 1033 } 1034 1035 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1036 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1037 { 1038 Mat_SeqAIJKokkos *aijkok; 1039 1040 PetscFunctionBegin; 1041 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1042 PetscValidPointer(kv, 2); 1043 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1044 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046 *kv = aijkok->a_dual.view_device(); 1047 PetscFunctionReturn(PETSC_SUCCESS); 1048 } 1049 1050 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1051 { 1052 PetscFunctionBegin; 1053 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1054 PetscValidPointer(kv, 2); 1055 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1056 PetscFunctionReturn(PETSC_SUCCESS); 1057 } 1058 1059 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1060 { 1061 Mat_SeqAIJKokkos *aijkok; 1062 1063 PetscFunctionBegin; 1064 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1065 PetscValidPointer(kv, 2); 1066 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1067 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1068 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1069 *kv = aijkok->a_dual.view_device(); 1070 PetscFunctionReturn(PETSC_SUCCESS); 1071 } 1072 1073 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1074 { 1075 PetscFunctionBegin; 1076 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1077 PetscValidPointer(kv, 2); 1078 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1079 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1080 PetscFunctionReturn(PETSC_SUCCESS); 1081 } 1082 1083 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1084 { 1085 Mat_SeqAIJKokkos *aijkok; 1086 1087 PetscFunctionBegin; 1088 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1089 PetscValidPointer(kv, 2); 1090 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1091 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092 *kv = aijkok->a_dual.view_device(); 1093 PetscFunctionReturn(PETSC_SUCCESS); 1094 } 1095 1096 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1097 { 1098 PetscFunctionBegin; 1099 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1100 PetscValidPointer(kv, 2); 1101 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1102 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1103 PetscFunctionReturn(PETSC_SUCCESS); 1104 } 1105 1106 /* Computes Y += alpha X */ 1107 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1108 { 1109 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1110 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1111 ConstMatScalarKokkosView Xa; 1112 MatScalarKokkosView Ya; 1113 1114 PetscFunctionBegin; 1115 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1116 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1117 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1118 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1119 PetscCall(PetscLogGpuTimeBegin()); 1120 1121 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1122 PetscBool e; 1123 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1124 if (e) { 1125 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1126 if (e) pattern = SAME_NONZERO_PATTERN; 1127 } 1128 } 1129 1130 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1131 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1132 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1133 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1134 */ 1135 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1136 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1137 Xa = xkok->a_dual.view_device(); 1138 Ya = ykok->a_dual.view_device(); 1139 1140 if (pattern == SAME_NONZERO_PATTERN) { 1141 KokkosBlas::axpy(alpha, Xa, Ya); 1142 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1143 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1144 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1145 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1146 1147 Kokkos::parallel_for( 1148 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1149 PetscInt i = t.league_rank(); // row i 1150 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1151 // Only one thread works in a team 1152 PetscInt p, q = Yi(i); 1153 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1154 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1155 if (Xj(p) == Yj(q)) { // Found it 1156 Ya(q) += alpha * Xa(p); 1157 q++; 1158 } else { 1159 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1160 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1161 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1162 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1163 #else 1164 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1165 #endif 1166 } 1167 } 1168 }); 1169 }); 1170 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1171 } else { // different nonzero patterns 1172 Mat Z; 1173 KokkosCsrMatrix zcsr; 1174 KernelHandle kh; 1175 kh.create_spadd_handle(true); // X, Y are sorted 1176 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1177 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1178 zkok = new Mat_SeqAIJKokkos(zcsr); 1179 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1180 PetscCall(MatHeaderReplace(Y, &Z)); 1181 kh.destroy_spadd_handle(); 1182 } 1183 PetscCall(PetscLogGpuTimeEnd()); 1184 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1189 { 1190 Mat_SeqAIJKokkos *akok; 1191 Mat_SeqAIJ *aseq; 1192 1193 PetscFunctionBegin; 1194 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1195 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1196 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1197 delete akok; 1198 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1199 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1200 akok->SetUpCOO(aseq); 1201 PetscFunctionReturn(PETSC_SUCCESS); 1202 } 1203 1204 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1205 { 1206 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1207 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1208 PetscCount Annz = aseq->nz; 1209 const PetscCountKokkosView &jmap = akok->jmap_d; 1210 const PetscCountKokkosView &perm = akok->perm_d; 1211 MatScalarKokkosView Aa; 1212 ConstMatScalarKokkosView kv; 1213 PetscMemType memtype; 1214 1215 PetscFunctionBegin; 1216 PetscCall(PetscGetMemType(v, &memtype)); 1217 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1218 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1219 } else { 1220 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1221 } 1222 1223 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1224 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1225 1226 Kokkos::parallel_for( 1227 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1228 PetscScalar sum = 0.0; 1229 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1230 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1231 }); 1232 1233 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1234 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1235 PetscFunctionReturn(PETSC_SUCCESS); 1236 } 1237 1238 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1239 { 1240 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1241 MatScalarKokkosView Aa; 1242 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1243 PetscInt m = A->rmap->n; 1244 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1245 1246 PetscFunctionBegin; 1247 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1248 Kokkos::parallel_for( 1249 m, KOKKOS_LAMBDA(const PetscInt i) { 1250 PetscScalar tmp; 1251 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1252 tmp = Aa(Ai(i)); 1253 Aa(Ai(i)) = Aa(Adiag(i)); 1254 Aa(Adiag(i)) = tmp; 1255 } 1256 }); 1257 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1258 PetscFunctionReturn(PETSC_SUCCESS); 1259 } 1260 1261 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1262 { 1263 PetscFunctionBegin; 1264 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1265 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1266 B->offloadmask = PETSC_OFFLOAD_CPU; 1267 PetscFunctionReturn(PETSC_SUCCESS); 1268 } 1269 1270 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1271 { 1272 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1273 1274 PetscFunctionBegin; 1275 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1276 A->boundtocpu = PETSC_FALSE; 1277 1278 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1279 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1280 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1281 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1282 A->ops->scale = MatScale_SeqAIJKokkos; 1283 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1284 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1285 A->ops->mult = MatMult_SeqAIJKokkos; 1286 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1287 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1288 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1289 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1290 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1291 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1292 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1293 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1294 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1295 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1296 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1297 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1298 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1299 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1300 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1301 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1302 1303 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1304 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1305 PetscFunctionReturn(PETSC_SUCCESS); 1306 } 1307 1308 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1309 { 1310 Mat_SeqAIJ *aseq; 1311 PetscInt i, m, n; 1312 1313 PetscFunctionBegin; 1314 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1315 1316 m = akok->nrows(); 1317 n = akok->ncols(); 1318 PetscCall(MatSetSizes(A, m, n, m, n)); 1319 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1320 1321 /* Set up data structures of A as a MATSEQAIJ */ 1322 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1323 aseq = (Mat_SeqAIJ *)(A)->data; 1324 1325 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1326 akok->j_dual.sync_host(); 1327 1328 aseq->i = akok->i_host_data(); 1329 aseq->j = akok->j_host_data(); 1330 aseq->a = akok->a_host_data(); 1331 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1332 aseq->singlemalloc = PETSC_FALSE; 1333 aseq->free_a = PETSC_FALSE; 1334 aseq->free_ij = PETSC_FALSE; 1335 aseq->nz = akok->nnz(); 1336 aseq->maxnz = aseq->nz; 1337 1338 PetscCall(PetscMalloc1(m, &aseq->imax)); 1339 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1340 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1341 1342 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1343 akok->nonzerostate = A->nonzerostate; 1344 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1345 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1346 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1347 PetscFunctionReturn(PETSC_SUCCESS); 1348 } 1349 1350 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1351 { 1352 PetscFunctionBegin; 1353 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1354 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1355 PetscFunctionReturn(PETSC_SUCCESS); 1356 } 1357 1358 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1359 { 1360 Mat_SeqAIJKokkos *akok; 1361 PetscFunctionBegin; 1362 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1363 PetscCall(MatCreate(comm, A)); 1364 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1365 PetscFunctionReturn(PETSC_SUCCESS); 1366 } 1367 1368 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1369 1370 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1371 */ 1372 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1373 { 1374 PetscFunctionBegin; 1375 PetscCall(MatCreate(comm, A)); 1376 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1377 PetscFunctionReturn(PETSC_SUCCESS); 1378 } 1379 1380 /*@C 1381 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1382 (the default parallel PETSc format). This matrix will ultimately be handled by 1383 Kokkos for calculations. 1384 1385 Collective 1386 1387 Input Parameters: 1388 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1389 . m - number of rows 1390 . n - number of columns 1391 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1392 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1393 1394 Output Parameter: 1395 . A - the matrix 1396 1397 Level: intermediate 1398 1399 Notes: 1400 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1401 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1402 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1403 1404 The AIJ format, also called 1405 compressed row storage, is fully compatible with standard Fortran 1406 storage. That is, the stored row and column indices can begin at 1407 either one (as in Fortran) or zero. 1408 1409 Specify the preallocated storage with either `nz` or `nnz` (not both). 1410 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1411 allocation. 1412 1413 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1414 @*/ 1415 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1416 { 1417 PetscFunctionBegin; 1418 PetscCall(PetscKokkosInitializeCheck()); 1419 PetscCall(MatCreate(comm, A)); 1420 PetscCall(MatSetSizes(*A, m, n, m, n)); 1421 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1422 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1423 PetscFunctionReturn(PETSC_SUCCESS); 1424 } 1425 1426 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1427 { 1428 PetscFunctionBegin; 1429 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1430 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1431 PetscFunctionReturn(PETSC_SUCCESS); 1432 } 1433 1434 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1435 { 1436 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1437 1438 PetscFunctionBegin; 1439 if (!factors->sptrsv_symbolic_completed) { 1440 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1441 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1442 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1443 } 1444 PetscFunctionReturn(PETSC_SUCCESS); 1445 } 1446 1447 /* Check if we need to update factors etc for transpose solve */ 1448 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1449 { 1450 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1451 MatColIdxType n = A->rmap->n; 1452 1453 PetscFunctionBegin; 1454 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1455 /* Update L^T and do sptrsv symbolic */ 1456 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1457 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1458 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1459 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1460 1461 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1462 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1463 1464 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1465 We have to sort the indices, until KK provides finer control options. 1466 */ 1467 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1468 1469 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1470 1471 /* Update U^T and do sptrsv symbolic */ 1472 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1473 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1474 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1475 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1476 1477 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1478 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1479 1480 /* Sort indices. See comments above */ 1481 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1482 1483 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1484 factors->transpose_updated = PETSC_TRUE; 1485 } 1486 PetscFunctionReturn(PETSC_SUCCESS); 1487 } 1488 1489 /* Solve Ax = b, with A = LU */ 1490 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1491 { 1492 ConstPetscScalarKokkosView bv; 1493 PetscScalarKokkosView xv; 1494 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1495 1496 PetscFunctionBegin; 1497 PetscCall(PetscLogGpuTimeBegin()); 1498 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1499 PetscCall(VecGetKokkosView(b, &bv)); 1500 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1501 /* Solve L tmpv = b */ 1502 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1503 /* Solve Ux = tmpv */ 1504 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1505 PetscCall(VecRestoreKokkosView(b, &bv)); 1506 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1507 PetscCall(PetscLogGpuTimeEnd()); 1508 PetscFunctionReturn(PETSC_SUCCESS); 1509 } 1510 1511 /* Solve A^T x = b, where A^T = U^T L^T */ 1512 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1513 { 1514 ConstPetscScalarKokkosView bv; 1515 PetscScalarKokkosView xv; 1516 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1517 1518 PetscFunctionBegin; 1519 PetscCall(PetscLogGpuTimeBegin()); 1520 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1521 PetscCall(VecGetKokkosView(b, &bv)); 1522 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1523 /* Solve U^T tmpv = b */ 1524 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1525 1526 /* Solve L^T x = tmpv */ 1527 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1528 PetscCall(VecRestoreKokkosView(b, &bv)); 1529 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1530 PetscCall(PetscLogGpuTimeEnd()); 1531 PetscFunctionReturn(PETSC_SUCCESS); 1532 } 1533 1534 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1535 { 1536 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1537 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1538 PetscInt fill_lev = info->levels; 1539 1540 PetscFunctionBegin; 1541 PetscCall(PetscLogGpuTimeBegin()); 1542 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1543 1544 auto a_d = aijkok->a_dual.view_device(); 1545 auto i_d = aijkok->i_dual.view_device(); 1546 auto j_d = aijkok->j_dual.view_device(); 1547 1548 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1549 1550 B->assembled = PETSC_TRUE; 1551 B->preallocated = PETSC_TRUE; 1552 B->ops->solve = MatSolve_SeqAIJKokkos; 1553 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1554 B->ops->matsolve = NULL; 1555 B->ops->matsolvetranspose = NULL; 1556 B->offloadmask = PETSC_OFFLOAD_GPU; 1557 1558 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1559 factors->transpose_updated = PETSC_FALSE; 1560 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1561 /* TODO: log flops, but how to know that? */ 1562 PetscCall(PetscLogGpuTimeEnd()); 1563 PetscFunctionReturn(PETSC_SUCCESS); 1564 } 1565 1566 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1567 { 1568 Mat_SeqAIJKokkos *aijkok; 1569 Mat_SeqAIJ *b; 1570 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1571 PetscInt fill_lev = info->levels; 1572 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1573 PetscInt n = A->rmap->n; 1574 1575 PetscFunctionBegin; 1576 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1577 /* Rebuild factors */ 1578 if (factors) { 1579 factors->Destroy(); 1580 } /* Destroy the old if it exists */ 1581 else { 1582 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1583 } 1584 1585 /* Create a spiluk handle and then do symbolic factorization */ 1586 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1587 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1588 1589 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1590 1591 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1592 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1593 Kokkos::realloc(factors->iU_d, n + 1); 1594 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1595 1596 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1597 auto i_d = aijkok->i_dual.view_device(); 1598 auto j_d = aijkok->j_dual.view_device(); 1599 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1600 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1601 1602 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1603 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1604 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1605 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1606 1607 /* TODO: add options to select sptrsv algorithms */ 1608 /* Create sptrsv handles for L, U and their transpose */ 1609 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1610 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1611 #else 1612 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1613 #endif 1614 1615 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1616 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1617 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1618 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1619 1620 /* Fill fields of the factor matrix B */ 1621 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1622 b = (Mat_SeqAIJ *)B->data; 1623 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1624 B->info.fill_ratio_given = info->fill; 1625 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1626 1627 B->offloadmask = PETSC_OFFLOAD_GPU; 1628 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1633 { 1634 PetscFunctionBegin; 1635 *type = MATSOLVERKOKKOS; 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 /*MC 1640 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1641 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1642 1643 Level: beginner 1644 1645 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1646 M*/ 1647 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1648 { 1649 PetscInt n = A->rmap->n; 1650 1651 PetscFunctionBegin; 1652 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1653 PetscCall(MatSetSizes(*B, n, n, n, n)); 1654 (*B)->factortype = ftype; 1655 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1656 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1657 1658 if (ftype == MAT_FACTOR_LU) { 1659 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1660 (*B)->canuseordering = PETSC_TRUE; 1661 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1662 } else if (ftype == MAT_FACTOR_ILU) { 1663 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1664 (*B)->canuseordering = PETSC_FALSE; 1665 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1666 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1667 1668 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1669 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1670 PetscFunctionReturn(PETSC_SUCCESS); 1671 } 1672 1673 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1674 { 1675 PetscFunctionBegin; 1676 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1677 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1678 PetscFunctionReturn(PETSC_SUCCESS); 1679 } 1680 1681 /* Utility to print out a KokkosCsrMatrix for debugging */ 1682 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1683 { 1684 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1685 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1686 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1687 const PetscInt *i = iv.data(); 1688 const PetscInt *j = jv.data(); 1689 const PetscScalar *a = av.data(); 1690 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1691 1692 PetscFunctionBegin; 1693 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1694 for (PetscInt k = 0; k < m; k++) { 1695 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1696 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1697 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1698 } 1699 PetscFunctionReturn(PETSC_SUCCESS); 1700 } 1701