1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 /* 228 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 229 230 Input Parameter: 231 . A - the MATSEQAIJKOKKOS matrix 232 233 Output Parameters: 234 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 235 - T_d - the transpose on device, whose value array is allcoated but not initialized 236 */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 238 { 239 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 240 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 241 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 242 MatRowMapKokkosViewHost Ti_h("Ti", n + 1); 243 MatRowMapType *Ti = Ti_h.data(); 244 MatColIdxKokkosViewHost Tj_h("Tj", nz); 245 MatRowMapKokkosViewHost perm_h("permutation", nz); 246 PetscInt *Tj = Tj_h.data(); 247 PetscInt *perm = perm_h.data(); 248 PetscInt *offset; 249 250 PetscFunctionBegin; 251 // Populate Ti 252 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 253 Ti++; 254 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 255 Ti--; 256 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 257 258 // Populate Tj and the permutation array 259 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 260 for (PetscInt i = 0; i < m; i++) { 261 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 262 PetscInt r = Aj[j]; // row r of T 263 PetscInt disp = Ti[r] + offset[r]; 264 265 Tj[disp] = i; // col i of T 266 perm[disp] = j; 267 offset[r]++; 268 } 269 } 270 PetscCall(PetscFree(offset)); 271 272 // Sort each row of T, along with the permutation array 273 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 274 275 // Output perm and T on device 276 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 277 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 278 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 279 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 280 PetscFunctionReturn(PETSC_SUCCESS); 281 } 282 283 // Generate the transpose on device and cache it internally 284 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 285 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 286 { 287 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 288 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 289 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 290 KokkosCsrMatrix &T = akok->csrmatT; 291 292 PetscFunctionBegin; 293 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 294 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 295 296 const auto &Aa = akok->a_dual.view_device(); 297 298 if (A->symmetric == PETSC_BOOL3_TRUE) { 299 *csrmatT = akok->csrmat; 300 } else { 301 // See if we already have a cached transpose and its value is up to date 302 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 303 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 304 const auto &perm = akok->transpose_perm; // get the permutation array 305 auto &Ta = T.values; 306 307 PetscCallCXX(Kokkos::parallel_for( 308 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 309 } 310 } else { // Generate T of size n x m for the first time 311 MatRowMapKokkosView perm; 312 313 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 314 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 315 PetscCallCXX(Kokkos::parallel_for( 316 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 317 } 318 akok->transpose_updated = PETSC_TRUE; 319 *csrmatT = akok->csrmatT; 320 } 321 PetscFunctionReturn(PETSC_SUCCESS); 322 } 323 324 // Generate the Hermitian on device and cache it internally 325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 326 { 327 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 328 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 329 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 330 KokkosCsrMatrix &T = akok->csrmatH; 331 332 PetscFunctionBegin; 333 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 334 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 335 336 const auto &Aa = akok->a_dual.view_device(); 337 338 if (A->hermitian == PETSC_BOOL3_TRUE) { 339 *csrmatH = akok->csrmat; 340 } else { 341 // See if we already have a cached hermitian and its value is up to date 342 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 343 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 344 const auto &perm = akok->transpose_perm; // get the permutation array 345 auto &Ta = T.values; 346 347 PetscCallCXX(Kokkos::parallel_for( 348 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 349 } 350 } else { // Generate T of size n x m for the first time 351 MatRowMapKokkosView perm; 352 353 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 354 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 355 PetscCallCXX(Kokkos::parallel_for( 356 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 357 } 358 akok->hermitian_updated = PETSC_TRUE; 359 *csrmatH = akok->csrmatH; 360 } 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 364 /* y = A x */ 365 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 366 { 367 Mat_SeqAIJKokkos *aijkok; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 371 PetscFunctionBegin; 372 PetscCall(PetscLogGpuTimeBegin()); 373 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 374 PetscCall(VecGetKokkosView(xx, &xv)); 375 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 376 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 377 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 381 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 382 PetscCall(PetscLogGpuTimeEnd()); 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 /* y = A^T x */ 387 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 388 { 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv; 392 PetscScalarKokkosView yv; 393 KokkosCsrMatrix csrmat; 394 395 PetscFunctionBegin; 396 PetscCall(PetscLogGpuTimeBegin()); 397 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 398 PetscCall(VecGetKokkosView(xx, &xv)); 399 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 400 if (A->form_explicit_transpose) { 401 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 402 mode = "N"; 403 } else { 404 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 405 csrmat = aijkok->csrmat; 406 mode = "T"; 407 } 408 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 409 PetscCall(VecRestoreKokkosView(xx, &xv)); 410 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 411 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 412 PetscCall(PetscLogGpuTimeEnd()); 413 PetscFunctionReturn(PETSC_SUCCESS); 414 } 415 416 /* y = A^H x */ 417 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 418 { 419 Mat_SeqAIJKokkos *aijkok; 420 const char *mode; 421 ConstPetscScalarKokkosView xv; 422 PetscScalarKokkosView yv; 423 KokkosCsrMatrix csrmat; 424 425 PetscFunctionBegin; 426 PetscCall(PetscLogGpuTimeBegin()); 427 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 428 PetscCall(VecGetKokkosView(xx, &xv)); 429 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 430 if (A->form_explicit_transpose) { 431 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 432 mode = "N"; 433 } else { 434 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 435 csrmat = aijkok->csrmat; 436 mode = "C"; 437 } 438 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 439 PetscCall(VecRestoreKokkosView(xx, &xv)); 440 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 441 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 442 PetscCall(PetscLogGpuTimeEnd()); 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 /* z = A x + y */ 447 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 448 { 449 Mat_SeqAIJKokkos *aijkok; 450 ConstPetscScalarKokkosView xv, yv; 451 PetscScalarKokkosView zv; 452 453 PetscFunctionBegin; 454 PetscCall(PetscLogGpuTimeBegin()); 455 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 456 PetscCall(VecGetKokkosView(xx, &xv)); 457 PetscCall(VecGetKokkosView(yy, &yv)); 458 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 459 if (zz != yy) Kokkos::deep_copy(zv, yv); 460 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 461 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 462 PetscCall(VecRestoreKokkosView(xx, &xv)); 463 PetscCall(VecRestoreKokkosView(yy, &yv)); 464 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 465 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 466 PetscCall(PetscLogGpuTimeEnd()); 467 PetscFunctionReturn(PETSC_SUCCESS); 468 } 469 470 /* z = A^T x + y */ 471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 472 { 473 Mat_SeqAIJKokkos *aijkok; 474 const char *mode; 475 ConstPetscScalarKokkosView xv, yv; 476 PetscScalarKokkosView zv; 477 KokkosCsrMatrix csrmat; 478 479 PetscFunctionBegin; 480 PetscCall(PetscLogGpuTimeBegin()); 481 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 482 PetscCall(VecGetKokkosView(xx, &xv)); 483 PetscCall(VecGetKokkosView(yy, &yv)); 484 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 485 if (zz != yy) Kokkos::deep_copy(zv, yv); 486 if (A->form_explicit_transpose) { 487 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 488 mode = "N"; 489 } else { 490 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 491 csrmat = aijkok->csrmat; 492 mode = "T"; 493 } 494 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 495 PetscCall(VecRestoreKokkosView(xx, &xv)); 496 PetscCall(VecRestoreKokkosView(yy, &yv)); 497 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 498 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 499 PetscCall(PetscLogGpuTimeEnd()); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 /* z = A^H x + y */ 504 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 505 { 506 Mat_SeqAIJKokkos *aijkok; 507 const char *mode; 508 ConstPetscScalarKokkosView xv, yv; 509 PetscScalarKokkosView zv; 510 KokkosCsrMatrix csrmat; 511 512 PetscFunctionBegin; 513 PetscCall(PetscLogGpuTimeBegin()); 514 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 515 PetscCall(VecGetKokkosView(xx, &xv)); 516 PetscCall(VecGetKokkosView(yy, &yv)); 517 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 518 if (zz != yy) Kokkos::deep_copy(zv, yv); 519 if (A->form_explicit_transpose) { 520 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 521 mode = "N"; 522 } else { 523 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 524 csrmat = aijkok->csrmat; 525 mode = "C"; 526 } 527 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 528 PetscCall(VecRestoreKokkosView(xx, &xv)); 529 PetscCall(VecRestoreKokkosView(yy, &yv)); 530 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 531 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 532 PetscCall(PetscLogGpuTimeEnd()); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 537 { 538 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 540 PetscFunctionBegin; 541 switch (op) { 542 case MAT_FORM_EXPLICIT_TRANSPOSE: 543 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 544 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 545 A->form_explicit_transpose = flg; 546 break; 547 default: 548 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 549 break; 550 } 551 PetscFunctionReturn(PETSC_SUCCESS); 552 } 553 554 /* Depending on reuse, either build a new mat, or use the existing mat */ 555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 556 { 557 Mat_SeqAIJ *aseq; 558 559 PetscFunctionBegin; 560 PetscCall(PetscKokkosInitializeCheck()); 561 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 562 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 563 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 564 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 565 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 566 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 567 PetscCall(PetscFree(A->defaultvectype)); 568 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 569 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 570 PetscCall(MatSetOps_SeqAIJKokkos(A)); 571 aseq = static_cast<Mat_SeqAIJ *>(A->data); 572 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 573 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 574 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 575 } 576 } 577 PetscFunctionReturn(PETSC_SUCCESS); 578 } 579 580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 581 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 582 */ 583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 584 { 585 Mat_SeqAIJ *bseq; 586 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 587 Mat mat; 588 589 PetscFunctionBegin; 590 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 591 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 592 mat = *B; 593 if (A->assembled) { 594 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 595 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 596 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 597 /* Now copy values to B if needed */ 598 if (dupOption == MAT_COPY_VALUES) { 599 if (akok->a_dual.need_sync_device()) { 600 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 601 bkok->a_dual.modify_host(); 602 } else { /* If device has the latest data, we only copy data on device */ 603 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 604 bkok->a_dual.modify_device(); 605 } 606 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 607 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 608 bkok->a_dual.modify_host(); 609 } 610 mat->spptr = bkok; 611 } 612 613 PetscCall(PetscFree(mat->defaultvectype)); 614 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 615 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 616 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 621 { 622 Mat At; 623 KokkosCsrMatrix internT; 624 Mat_SeqAIJKokkos *atkok, *bkok; 625 626 PetscFunctionBegin; 627 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 628 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 629 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 630 /* Deep copy internT, as we want to isolate the internal transpose */ 631 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 632 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 633 if (reuse == MAT_INITIAL_MATRIX) *B = At; 634 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 635 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 636 if ((*B)->assembled) { 637 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 638 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 639 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 640 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 641 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 642 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 643 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 644 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 645 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 646 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 647 } 648 PetscFunctionReturn(PETSC_SUCCESS); 649 } 650 651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 652 { 653 Mat_SeqAIJKokkos *aijkok; 654 655 PetscFunctionBegin; 656 if (A->factortype == MAT_FACTOR_NONE) { 657 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 658 delete aijkok; 659 } else { 660 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 661 } 662 A->spptr = NULL; 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 664 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 665 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 666 PetscCall(MatDestroy_SeqAIJ(A)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /*MC 671 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 672 673 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 674 675 Options Database Key: 676 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 677 678 Level: beginner 679 680 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 681 M*/ 682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 683 { 684 PetscFunctionBegin; 685 PetscCall(PetscKokkosInitializeCheck()); 686 PetscCall(MatCreate_SeqAIJ(A)); 687 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 688 PetscFunctionReturn(PETSC_SUCCESS); 689 } 690 691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 693 { 694 Mat_SeqAIJ *a, *b; 695 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 696 MatScalarKokkosView aa, ba, ca; 697 MatRowMapKokkosView ai, bi, ci; 698 MatColIdxKokkosView aj, bj, cj; 699 PetscInt m, n, nnz, aN; 700 701 PetscFunctionBegin; 702 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 703 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 704 PetscValidPointer(C, 4); 705 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 706 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 707 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 708 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 709 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 a = static_cast<Mat_SeqAIJ *>(A->data); 713 b = static_cast<Mat_SeqAIJ *>(B->data); 714 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 715 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 716 aa = akok->a_dual.view_device(); 717 ai = akok->i_dual.view_device(); 718 ba = bkok->a_dual.view_device(); 719 bi = bkok->i_dual.view_device(); 720 m = A->rmap->n; /* M, N and nnz of C */ 721 n = A->cmap->n + B->cmap->n; 722 nnz = a->nz + b->nz; 723 aN = A->cmap->n; /* N of A */ 724 if (reuse == MAT_INITIAL_MATRIX) { 725 aj = akok->j_dual.view_device(); 726 bj = bkok->j_dual.view_device(); 727 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 728 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 729 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 730 ca = ca_dual.view_device(); 731 ci = ci_dual.view_device(); 732 cj = cj_dual.view_device(); 733 734 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 735 Kokkos::parallel_for( 736 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 737 PetscInt i = t.league_rank(); /* row i */ 738 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 739 740 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 741 ci(i) = coffset; 742 if (i == m - 1) ci(m) = ai(m) + bi(m); 743 }); 744 745 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746 if (k < alen) { 747 ca(coffset + k) = aa(ai(i) + k); 748 cj(coffset + k) = aj(ai(i) + k); 749 } else { 750 ca(coffset + k) = ba(bi(i) + k - alen); 751 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 752 } 753 }); 754 }); 755 ca_dual.modify_device(); 756 ci_dual.modify_device(); 757 cj_dual.modify_device(); 758 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 759 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 760 } else if (reuse == MAT_REUSE_MATRIX) { 761 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 762 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 763 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 764 ca = ckok->a_dual.view_device(); 765 ci = ckok->i_dual.view_device(); 766 767 Kokkos::parallel_for( 768 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 769 PetscInt i = t.league_rank(); /* row i */ 770 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 771 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 772 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 773 else ca(ci(i) + k) = ba(bi(i) + k - alen); 774 }); 775 }); 776 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 777 } 778 PetscFunctionReturn(PETSC_SUCCESS); 779 } 780 781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 782 { 783 PetscFunctionBegin; 784 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 789 { 790 Mat_Product *product = C->product; 791 Mat A, B; 792 bool transA, transB; /* use bool, since KK needs this type */ 793 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 794 Mat_SeqAIJ *c; 795 MatProductData_SeqAIJKokkos *pdata; 796 KokkosCsrMatrix csrmatA, csrmatB; 797 798 PetscFunctionBegin; 799 MatCheckProduct(C, 1); 800 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 801 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 802 803 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 804 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 805 // we still do numeric. 806 if (pdata->reusesym) { // numeric reuses results from symbolic 807 pdata->reusesym = PETSC_FALSE; 808 PetscFunctionReturn(PETSC_SUCCESS); 809 } 810 811 switch (product->type) { 812 case MATPRODUCT_AB: 813 transA = false; 814 transB = false; 815 break; 816 case MATPRODUCT_AtB: 817 transA = true; 818 transB = false; 819 break; 820 case MATPRODUCT_ABt: 821 transA = false; 822 transB = true; 823 break; 824 default: 825 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 826 } 827 828 A = product->A; 829 B = product->B; 830 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 831 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 832 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 833 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 834 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 835 836 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 837 838 csrmatA = akok->csrmat; 839 csrmatB = bkok->csrmat; 840 841 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 842 if (transA) { 843 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 844 transA = false; 845 } 846 847 if (transB) { 848 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 849 transB = false; 850 } 851 PetscCall(PetscLogGpuTimeBegin()); 852 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 854 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 855 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 856 #endif 857 858 PetscCall(PetscLogGpuTimeEnd()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 860 /* shorter version of MatAssemblyEnd_SeqAIJ */ 861 c = (Mat_SeqAIJ *)C->data; 862 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 863 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 864 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 865 c->reallocs = 0; 866 C->info.mallocs = 0; 867 C->info.nz_unneeded = 0; 868 C->assembled = C->was_assembled = PETSC_TRUE; 869 C->num_ass++; 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 874 { 875 Mat_Product *product = C->product; 876 MatProductType ptype; 877 Mat A, B; 878 bool transA, transB; 879 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 880 MatProductData_SeqAIJKokkos *pdata; 881 MPI_Comm comm; 882 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 883 884 PetscFunctionBegin; 885 MatCheckProduct(C, 1); 886 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 887 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 888 A = product->A; 889 B = product->B; 890 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 891 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 892 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 893 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 894 csrmatA = akok->csrmat; 895 csrmatB = bkok->csrmat; 896 897 ptype = product->type; 898 // Take advantage of the symmetry if true 899 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 900 ptype = MATPRODUCT_AB; 901 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 902 } 903 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 904 ptype = MATPRODUCT_AB; 905 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 906 } 907 908 switch (ptype) { 909 case MATPRODUCT_AB: 910 transA = false; 911 transB = false; 912 break; 913 case MATPRODUCT_AtB: 914 transA = true; 915 transB = false; 916 break; 917 case MATPRODUCT_ABt: 918 transA = false; 919 transB = true; 920 break; 921 default: 922 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 923 } 924 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 925 pdata->reusesym = product->api_user; 926 927 /* TODO: add command line options to select spgemm algorithms */ 928 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 929 930 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 932 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 933 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 934 #endif 935 #endif 936 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 937 938 PetscCall(PetscLogGpuTimeBegin()); 939 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 940 if (transA) { 941 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 942 transA = false; 943 } 944 945 if (transB) { 946 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 947 transB = false; 948 } 949 950 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 951 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 952 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 953 calling new Mat_SeqAIJKokkos(). 954 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 955 */ 956 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 958 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 959 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 960 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 961 #endif 962 PetscCall(PetscLogGpuTimeEnd()); 963 964 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 965 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 966 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 /* handles sparse matrix matrix ops */ 971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 972 { 973 Mat_Product *product = mat->product; 974 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 975 976 PetscFunctionBegin; 977 MatCheckProduct(mat, 1); 978 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 979 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 980 if (Biskok && Ciskok) { 981 switch (product->type) { 982 case MATPRODUCT_AB: 983 case MATPRODUCT_AtB: 984 case MATPRODUCT_ABt: 985 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 986 break; 987 case MATPRODUCT_PtAP: 988 case MATPRODUCT_RARt: 989 case MATPRODUCT_ABC: 990 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 991 break; 992 default: 993 break; 994 } 995 } else { /* fallback for AIJ */ 996 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 997 } 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1002 { 1003 Mat_SeqAIJKokkos *aijkok; 1004 1005 PetscFunctionBegin; 1006 PetscCall(PetscLogGpuTimeBegin()); 1007 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1010 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1011 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1012 PetscCall(PetscLogGpuTimeEnd()); 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1017 { 1018 Mat_SeqAIJKokkos *aijkok; 1019 1020 PetscFunctionBegin; 1021 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1022 if (aijkok) { /* Only zero the device if data is already there */ 1023 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1024 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1025 } else { /* Might be preallocated but not assembled */ 1026 PetscCall(MatZeroEntries_SeqAIJ(A)); 1027 } 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 1031 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1032 { 1033 Mat_SeqAIJ *aijseq; 1034 Mat_SeqAIJKokkos *aijkok; 1035 PetscInt n; 1036 PetscScalarKokkosView xv; 1037 1038 PetscFunctionBegin; 1039 PetscCall(VecGetLocalSize(x, &n)); 1040 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1041 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1042 1043 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045 1046 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1047 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1048 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1049 aijkok->SetDiagonal(aijseq->diag); 1050 } 1051 1052 const auto &Aa = aijkok->a_dual.view_device(); 1053 const auto &Ai = aijkok->i_dual.view_device(); 1054 const auto &Adiag = aijkok->diag_dual.view_device(); 1055 1056 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1057 Kokkos::parallel_for( 1058 n, KOKKOS_LAMBDA(const PetscInt i) { 1059 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1060 else xv(i) = 0; 1061 }); 1062 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1063 PetscFunctionReturn(PETSC_SUCCESS); 1064 } 1065 1066 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1067 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1068 { 1069 Mat_SeqAIJKokkos *aijkok; 1070 1071 PetscFunctionBegin; 1072 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1073 PetscValidPointer(kv, 2); 1074 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1075 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1076 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1077 *kv = aijkok->a_dual.view_device(); 1078 PetscFunctionReturn(PETSC_SUCCESS); 1079 } 1080 1081 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1082 { 1083 PetscFunctionBegin; 1084 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1085 PetscValidPointer(kv, 2); 1086 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1091 { 1092 Mat_SeqAIJKokkos *aijkok; 1093 1094 PetscFunctionBegin; 1095 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1096 PetscValidPointer(kv, 2); 1097 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 *kv = aijkok->a_dual.view_device(); 1101 PetscFunctionReturn(PETSC_SUCCESS); 1102 } 1103 1104 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1105 { 1106 PetscFunctionBegin; 1107 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1108 PetscValidPointer(kv, 2); 1109 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1110 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1111 PetscFunctionReturn(PETSC_SUCCESS); 1112 } 1113 1114 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1115 { 1116 Mat_SeqAIJKokkos *aijkok; 1117 1118 PetscFunctionBegin; 1119 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1120 PetscValidPointer(kv, 2); 1121 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1122 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1123 *kv = aijkok->a_dual.view_device(); 1124 PetscFunctionReturn(PETSC_SUCCESS); 1125 } 1126 1127 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1128 { 1129 PetscFunctionBegin; 1130 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1131 PetscValidPointer(kv, 2); 1132 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1133 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1134 PetscFunctionReturn(PETSC_SUCCESS); 1135 } 1136 1137 /* Computes Y += alpha X */ 1138 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1139 { 1140 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1141 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1142 ConstMatScalarKokkosView Xa; 1143 MatScalarKokkosView Ya; 1144 1145 PetscFunctionBegin; 1146 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1147 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1148 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1149 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1150 PetscCall(PetscLogGpuTimeBegin()); 1151 1152 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1153 PetscBool e; 1154 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1155 if (e) { 1156 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1157 if (e) pattern = SAME_NONZERO_PATTERN; 1158 } 1159 } 1160 1161 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1162 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1163 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1164 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1165 */ 1166 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1167 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1168 Xa = xkok->a_dual.view_device(); 1169 Ya = ykok->a_dual.view_device(); 1170 1171 if (pattern == SAME_NONZERO_PATTERN) { 1172 KokkosBlas::axpy(alpha, Xa, Ya); 1173 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1174 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1175 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1176 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1177 1178 Kokkos::parallel_for( 1179 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1180 PetscInt i = t.league_rank(); // row i 1181 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1182 // Only one thread works in a team 1183 PetscInt p, q = Yi(i); 1184 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1185 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1186 if (Xj(p) == Yj(q)) { // Found it 1187 Ya(q) += alpha * Xa(p); 1188 q++; 1189 } else { 1190 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1191 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1192 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1193 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1194 #else 1195 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1196 #endif 1197 } 1198 } 1199 }); 1200 }); 1201 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1202 } else { // different nonzero patterns 1203 Mat Z; 1204 KokkosCsrMatrix zcsr; 1205 KernelHandle kh; 1206 kh.create_spadd_handle(true); // X, Y are sorted 1207 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1208 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1209 zkok = new Mat_SeqAIJKokkos(zcsr); 1210 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1211 PetscCall(MatHeaderReplace(Y, &Z)); 1212 kh.destroy_spadd_handle(); 1213 } 1214 PetscCall(PetscLogGpuTimeEnd()); 1215 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1220 { 1221 Mat_SeqAIJKokkos *akok; 1222 Mat_SeqAIJ *aseq; 1223 1224 PetscFunctionBegin; 1225 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1226 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1227 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1228 delete akok; 1229 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1230 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1231 akok->SetUpCOO(aseq); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1236 { 1237 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1238 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1239 PetscCount Annz = aseq->nz; 1240 const PetscCountKokkosView &jmap = akok->jmap_d; 1241 const PetscCountKokkosView &perm = akok->perm_d; 1242 MatScalarKokkosView Aa; 1243 ConstMatScalarKokkosView kv; 1244 PetscMemType memtype; 1245 1246 PetscFunctionBegin; 1247 PetscCall(PetscGetMemType(v, &memtype)); 1248 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1249 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1250 } else { 1251 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1252 } 1253 1254 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1255 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1256 1257 Kokkos::parallel_for( 1258 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1259 PetscScalar sum = 0.0; 1260 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1261 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1262 }); 1263 1264 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1265 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1266 PetscFunctionReturn(PETSC_SUCCESS); 1267 } 1268 1269 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1270 { 1271 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1272 MatScalarKokkosView Aa; 1273 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1274 PetscInt m = A->rmap->n; 1275 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1276 1277 PetscFunctionBegin; 1278 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1279 Kokkos::parallel_for( 1280 m, KOKKOS_LAMBDA(const PetscInt i) { 1281 PetscScalar tmp; 1282 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1283 tmp = Aa(Ai(i)); 1284 Aa(Ai(i)) = Aa(Adiag(i)); 1285 Aa(Adiag(i)) = tmp; 1286 } 1287 }); 1288 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1293 { 1294 PetscFunctionBegin; 1295 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1296 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1297 B->offloadmask = PETSC_OFFLOAD_CPU; 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1302 { 1303 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1304 1305 PetscFunctionBegin; 1306 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1307 A->boundtocpu = PETSC_FALSE; 1308 1309 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1310 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1311 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1312 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1313 A->ops->scale = MatScale_SeqAIJKokkos; 1314 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1315 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1316 A->ops->mult = MatMult_SeqAIJKokkos; 1317 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1318 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1319 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1320 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1321 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1322 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1323 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1324 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1325 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1326 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1327 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1328 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1329 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1330 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1331 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1332 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1333 1334 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1335 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1340 { 1341 Mat_SeqAIJ *aseq; 1342 PetscInt i, m, n; 1343 1344 PetscFunctionBegin; 1345 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1346 1347 m = akok->nrows(); 1348 n = akok->ncols(); 1349 PetscCall(MatSetSizes(A, m, n, m, n)); 1350 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1351 1352 /* Set up data structures of A as a MATSEQAIJ */ 1353 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1354 aseq = (Mat_SeqAIJ *)(A)->data; 1355 1356 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1357 akok->j_dual.sync_host(); 1358 1359 aseq->i = akok->i_host_data(); 1360 aseq->j = akok->j_host_data(); 1361 aseq->a = akok->a_host_data(); 1362 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1363 aseq->singlemalloc = PETSC_FALSE; 1364 aseq->free_a = PETSC_FALSE; 1365 aseq->free_ij = PETSC_FALSE; 1366 aseq->nz = akok->nnz(); 1367 aseq->maxnz = aseq->nz; 1368 1369 PetscCall(PetscMalloc1(m, &aseq->imax)); 1370 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1371 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1372 1373 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1374 akok->nonzerostate = A->nonzerostate; 1375 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1376 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1377 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1378 PetscFunctionReturn(PETSC_SUCCESS); 1379 } 1380 1381 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1382 { 1383 PetscFunctionBegin; 1384 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1385 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1390 { 1391 Mat_SeqAIJKokkos *akok; 1392 PetscFunctionBegin; 1393 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1394 PetscCall(MatCreate(comm, A)); 1395 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1396 PetscFunctionReturn(PETSC_SUCCESS); 1397 } 1398 1399 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1400 1401 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1402 */ 1403 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1404 { 1405 PetscFunctionBegin; 1406 PetscCall(MatCreate(comm, A)); 1407 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1408 PetscFunctionReturn(PETSC_SUCCESS); 1409 } 1410 1411 /*@C 1412 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1413 (the default parallel PETSc format). This matrix will ultimately be handled by 1414 Kokkos for calculations. For good matrix 1415 assembly performance the user should preallocate the matrix storage by setting 1416 the parameter nz (or the array nnz). By setting these parameters accurately, 1417 performance during matrix assembly can be increased by more than a factor of 50. 1418 1419 Collective 1420 1421 Input Parameters: 1422 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1423 . m - number of rows 1424 . n - number of columns 1425 . nz - number of nonzeros per row (same for all rows) 1426 - nnz - array containing the number of nonzeros in the various rows 1427 (possibly different for each row) or `NULL` 1428 1429 Output Parameter: 1430 . A - the matrix 1431 1432 Level: intermediate 1433 1434 Notes: 1435 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1436 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1437 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1438 1439 If `nnz` is given then `nz` is ignored 1440 1441 The AIJ format, also called 1442 compressed row storage, is fully compatible with standard Fortran 1443 storage. That is, the stored row and column indices can begin at 1444 either one (as in Fortran) or zero. See the users' manual for details. 1445 1446 Specify the preallocated storage with either `nz` or `nnz` (not both). 1447 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1448 allocation. 1449 1450 By default, this format uses inodes (identical nodes) when possible, to 1451 improve numerical efficiency of matrix-vector products and solves. We 1452 search for consecutive rows with the same nonzero structure, thereby 1453 reusing matrix information to achieve increased efficiency. 1454 1455 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1456 @*/ 1457 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1458 { 1459 PetscFunctionBegin; 1460 PetscCall(PetscKokkosInitializeCheck()); 1461 PetscCall(MatCreate(comm, A)); 1462 PetscCall(MatSetSizes(*A, m, n, m, n)); 1463 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1464 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1465 PetscFunctionReturn(PETSC_SUCCESS); 1466 } 1467 1468 typedef Kokkos::TeamPolicy<>::member_type team_member; 1469 // 1470 // This factorization exploits block diagonal matrices with "Nf" (not used). 1471 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1472 // 1473 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1474 { 1475 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1476 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1477 IS isrow = b->row, isicol = b->icol; 1478 const PetscInt *r_h, *ic_h; 1479 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1480 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1481 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1482 PetscBool row_identity, col_identity; 1483 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1484 1485 PetscFunctionBegin; 1486 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1487 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1488 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1489 PetscCall(ISGetIndices(isrow, &r_h)); 1490 PetscCall(ISGetIndices(isicol, &ic_h)); 1491 PetscCall(ISGetSize(isicol, &nc)); 1492 PetscCall(PetscLogGpuTimeBegin()); 1493 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1494 { 1495 #define KOKKOS_SHARED_LEVEL 1 1496 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1497 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1498 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1499 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1500 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1501 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1502 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1503 size_t flops_h = 0.0; 1504 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1505 Kokkos::View<size_t> d_flops_k("flops"); 1506 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1507 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 1508 Kokkos::deep_copy(d_flops_k, h_flops_k); 1509 Kokkos::deep_copy(d_r_k, h_r_k); 1510 Kokkos::deep_copy(d_ic_k, h_ic_k); 1511 // Fill A --> fact 1512 Kokkos::parallel_for( 1513 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1514 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1515 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1516 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1517 // zero rows of B 1518 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1519 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1520 PetscScalar *baL = ba_d + bi_d[rowb]; 1521 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1522 /* zero (unfactored row) */ 1523 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1524 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1525 }); 1526 // copy A into B 1527 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1528 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1529 const PetscScalar *av = aa_d + ai_d[rowa]; 1530 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1531 /* load in initial (unfactored row) */ 1532 for (int j = 0; j < nza; j++) { 1533 PetscInt colb = ic[ajtmp[j]]; 1534 PetscScalar vala = av[j]; 1535 if (colb == rowb) { 1536 *(ba_d + bdiag_d[rowb]) = vala; 1537 } else { 1538 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1539 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1540 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1541 for (int j = 0; j < nz; j++) { 1542 if (pbj[j] == colb) { 1543 pba[j] = vala; 1544 set++; 1545 break; 1546 } 1547 } 1548 #if !defined(PETSC_HAVE_SYCL) 1549 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1550 #endif 1551 } 1552 } 1553 }); 1554 }); 1555 Kokkos::fence(); 1556 1557 Kokkos::parallel_for( 1558 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1559 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1560 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1561 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1562 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1563 const PetscInt start = field * nloc, end = start + nloc; 1564 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1565 // A22 panel update for each row A(1,:) and col A(:,1) 1566 for (int ii = start; ii < end - 1; ii++) { 1567 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1568 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1569 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1570 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1571 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1572 PetscInt kIdx = j * Ni + field_block_idx; 1573 if (kIdx >= nzUi) /* void */ 1574 ; 1575 else { 1576 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1577 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1578 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1579 size_t st_idx; 1580 // find and do L(k,i) = A(:k,i) / A(i,i) 1581 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1582 // get column, there has got to be a better way 1583 Kokkos::parallel_reduce( 1584 Kokkos::ThreadVectorRange(team, nzL), 1585 [&](const int &j, size_t &idx) { 1586 if (pjL[j] == ii) { 1587 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1588 idx = j; // output 1589 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1590 } 1591 }, 1592 st_idx); 1593 Kokkos::single(Kokkos::PerThread(team), [=]() { 1594 colkIdx() = st_idx; 1595 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1596 }); 1597 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1598 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1599 #endif 1600 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1601 // U(i+1,:end) 1602 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1603 PetscScalar Uij = baUi[uiIdx]; 1604 PetscInt col = bjUi[uiIdx]; 1605 if (col == myk) { 1606 // A_kk = A_kk - L_ki * U_ij(k) 1607 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1608 *Akkv = *Akkv - L_ki() * Uij; // UiK 1609 } else { 1610 PetscScalar *start, *end, *pAkjv = NULL; 1611 PetscInt high, low; 1612 const PetscInt *startj; 1613 if (col < myk) { // L 1614 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1615 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1616 start = pLki + 1; // start at pLki+1, A22(myk,1) 1617 startj = bj_d + bi_d[myk] + idx; 1618 end = ba_d + bi_d[myk + 1]; 1619 } else { 1620 PetscInt idx = bdiag_d[myk + 1] + 1; 1621 start = ba_d + idx; 1622 startj = bj_d + idx; 1623 end = ba_d + bdiag_d[myk]; 1624 } 1625 // search for 'col', use bisection search - TODO 1626 low = 0; 1627 high = (PetscInt)(end - start); 1628 while (high - low > 5) { 1629 int t = (low + high) / 2; 1630 if (startj[t] > col) high = t; 1631 else low = t; 1632 } 1633 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1634 if (startj[pAkjv - start] == col) break; 1635 } 1636 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1637 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1638 #endif 1639 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1640 } 1641 }); 1642 } 1643 }); 1644 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1645 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1646 } /* endof for (i=0; i<n; i++) { */ 1647 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1648 Kokkos::atomic_add(&d_flops_k(), flops()); 1649 flops() = 0; 1650 }); 1651 }); 1652 Kokkos::fence(); 1653 Kokkos::deep_copy(h_flops_k, d_flops_k); 1654 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1655 Kokkos::parallel_for( 1656 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1657 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1658 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1659 /* Invert diagonal for simpler triangular solves */ 1660 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1661 int i = start + outer_index * Ni + lg_rank % Ni; 1662 if (i < end) { 1663 PetscScalar *pv = ba_d + bdiag_d[i]; 1664 *pv = 1.0 / (*pv); 1665 } 1666 }); 1667 }); 1668 } 1669 PetscCall(PetscLogGpuTimeEnd()); 1670 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1671 PetscCall(ISRestoreIndices(isrow, &r_h)); 1672 1673 PetscCall(ISIdentity(isrow, &row_identity)); 1674 PetscCall(ISIdentity(isicol, &col_identity)); 1675 if (b->inode.size) { 1676 B->ops->solve = MatSolve_SeqAIJ_Inode; 1677 } else if (row_identity && col_identity) { 1678 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1679 } else { 1680 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1681 } 1682 B->offloadmask = PETSC_OFFLOAD_GPU; 1683 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1684 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1685 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1686 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1687 B->ops->matsolve = MatMatSolve_SeqAIJ; 1688 B->assembled = PETSC_TRUE; 1689 B->preallocated = PETSC_TRUE; 1690 1691 PetscFunctionReturn(PETSC_SUCCESS); 1692 } 1693 1694 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1695 { 1696 PetscFunctionBegin; 1697 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1698 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1699 PetscFunctionReturn(PETSC_SUCCESS); 1700 } 1701 1702 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1703 { 1704 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1705 1706 PetscFunctionBegin; 1707 if (!factors->sptrsv_symbolic_completed) { 1708 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1709 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1710 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1711 } 1712 PetscFunctionReturn(PETSC_SUCCESS); 1713 } 1714 1715 /* Check if we need to update factors etc for transpose solve */ 1716 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1717 { 1718 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1719 MatColIdxType n = A->rmap->n; 1720 1721 PetscFunctionBegin; 1722 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1723 /* Update L^T and do sptrsv symbolic */ 1724 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1725 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1726 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1727 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1728 1729 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1730 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1731 1732 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1733 We have to sort the indices, until KK provides finer control options. 1734 */ 1735 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1736 1737 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1738 1739 /* Update U^T and do sptrsv symbolic */ 1740 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1741 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1742 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1743 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1744 1745 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1746 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1747 1748 /* Sort indices. See comments above */ 1749 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1750 1751 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1752 factors->transpose_updated = PETSC_TRUE; 1753 } 1754 PetscFunctionReturn(PETSC_SUCCESS); 1755 } 1756 1757 /* Solve Ax = b, with A = LU */ 1758 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1759 { 1760 ConstPetscScalarKokkosView bv; 1761 PetscScalarKokkosView xv; 1762 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1763 1764 PetscFunctionBegin; 1765 PetscCall(PetscLogGpuTimeBegin()); 1766 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1767 PetscCall(VecGetKokkosView(b, &bv)); 1768 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1769 /* Solve L tmpv = b */ 1770 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1771 /* Solve Ux = tmpv */ 1772 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1773 PetscCall(VecRestoreKokkosView(b, &bv)); 1774 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1775 PetscCall(PetscLogGpuTimeEnd()); 1776 PetscFunctionReturn(PETSC_SUCCESS); 1777 } 1778 1779 /* Solve A^T x = b, where A^T = U^T L^T */ 1780 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1781 { 1782 ConstPetscScalarKokkosView bv; 1783 PetscScalarKokkosView xv; 1784 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1785 1786 PetscFunctionBegin; 1787 PetscCall(PetscLogGpuTimeBegin()); 1788 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1789 PetscCall(VecGetKokkosView(b, &bv)); 1790 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1791 /* Solve U^T tmpv = b */ 1792 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1793 1794 /* Solve L^T x = tmpv */ 1795 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1796 PetscCall(VecRestoreKokkosView(b, &bv)); 1797 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1798 PetscCall(PetscLogGpuTimeEnd()); 1799 PetscFunctionReturn(PETSC_SUCCESS); 1800 } 1801 1802 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1803 { 1804 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1805 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1806 PetscInt fill_lev = info->levels; 1807 1808 PetscFunctionBegin; 1809 PetscCall(PetscLogGpuTimeBegin()); 1810 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1811 1812 auto a_d = aijkok->a_dual.view_device(); 1813 auto i_d = aijkok->i_dual.view_device(); 1814 auto j_d = aijkok->j_dual.view_device(); 1815 1816 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1817 1818 B->assembled = PETSC_TRUE; 1819 B->preallocated = PETSC_TRUE; 1820 B->ops->solve = MatSolve_SeqAIJKokkos; 1821 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1822 B->ops->matsolve = NULL; 1823 B->ops->matsolvetranspose = NULL; 1824 B->offloadmask = PETSC_OFFLOAD_GPU; 1825 1826 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1827 factors->transpose_updated = PETSC_FALSE; 1828 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1829 /* TODO: log flops, but how to know that? */ 1830 PetscCall(PetscLogGpuTimeEnd()); 1831 PetscFunctionReturn(PETSC_SUCCESS); 1832 } 1833 1834 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1835 { 1836 Mat_SeqAIJKokkos *aijkok; 1837 Mat_SeqAIJ *b; 1838 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1839 PetscInt fill_lev = info->levels; 1840 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1841 PetscInt n = A->rmap->n; 1842 1843 PetscFunctionBegin; 1844 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1845 /* Rebuild factors */ 1846 if (factors) { 1847 factors->Destroy(); 1848 } /* Destroy the old if it exists */ 1849 else { 1850 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1851 } 1852 1853 /* Create a spiluk handle and then do symbolic factorization */ 1854 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1855 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1856 1857 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1858 1859 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1860 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1861 Kokkos::realloc(factors->iU_d, n + 1); 1862 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1863 1864 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1865 auto i_d = aijkok->i_dual.view_device(); 1866 auto j_d = aijkok->j_dual.view_device(); 1867 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1868 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1869 1870 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1871 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1872 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1873 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1874 1875 /* TODO: add options to select sptrsv algorithms */ 1876 /* Create sptrsv handles for L, U and their transpose */ 1877 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1878 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1879 #else 1880 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1881 #endif 1882 1883 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1884 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1885 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1886 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1887 1888 /* Fill fields of the factor matrix B */ 1889 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1890 b = (Mat_SeqAIJ *)B->data; 1891 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1892 B->info.fill_ratio_given = info->fill; 1893 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1894 1895 B->offloadmask = PETSC_OFFLOAD_GPU; 1896 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1897 PetscFunctionReturn(PETSC_SUCCESS); 1898 } 1899 1900 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1901 { 1902 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1903 const PetscInt nrows = A->rmap->n; 1904 1905 PetscFunctionBegin; 1906 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1907 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1908 // move B data into Kokkos 1909 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1910 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1911 { 1912 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1913 if (!baijkok->diag_d.extent(0)) { 1914 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1915 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1916 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1917 } 1918 } 1919 PetscFunctionReturn(PETSC_SUCCESS); 1920 } 1921 1922 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1923 { 1924 PetscFunctionBegin; 1925 *type = MATSOLVERKOKKOS; 1926 PetscFunctionReturn(PETSC_SUCCESS); 1927 } 1928 1929 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1930 { 1931 PetscFunctionBegin; 1932 *type = MATSOLVERKOKKOSDEVICE; 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 /*MC 1937 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1938 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1939 1940 Level: beginner 1941 1942 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1943 M*/ 1944 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1945 { 1946 PetscInt n = A->rmap->n; 1947 1948 PetscFunctionBegin; 1949 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1950 PetscCall(MatSetSizes(*B, n, n, n, n)); 1951 (*B)->factortype = ftype; 1952 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1953 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1954 1955 if (ftype == MAT_FACTOR_LU) { 1956 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1957 (*B)->canuseordering = PETSC_TRUE; 1958 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1959 } else if (ftype == MAT_FACTOR_ILU) { 1960 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1961 (*B)->canuseordering = PETSC_FALSE; 1962 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1963 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1964 1965 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1966 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1967 PetscFunctionReturn(PETSC_SUCCESS); 1968 } 1969 1970 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1971 { 1972 PetscInt n = A->rmap->n; 1973 1974 PetscFunctionBegin; 1975 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1976 PetscCall(MatSetSizes(*B, n, n, n, n)); 1977 (*B)->factortype = ftype; 1978 (*B)->canuseordering = PETSC_TRUE; 1979 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1980 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1981 1982 if (ftype == MAT_FACTOR_LU) { 1983 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1984 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1985 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1986 1987 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1988 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1989 PetscFunctionReturn(PETSC_SUCCESS); 1990 } 1991 1992 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1993 { 1994 PetscFunctionBegin; 1995 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1996 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1997 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1998 PetscFunctionReturn(PETSC_SUCCESS); 1999 } 2000 2001 /* Utility to print out a KokkosCsrMatrix for debugging */ 2002 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2003 { 2004 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 2005 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 2006 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 2007 const PetscInt *i = iv.data(); 2008 const PetscInt *j = jv.data(); 2009 const PetscScalar *a = av.data(); 2010 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2011 2012 PetscFunctionBegin; 2013 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2014 for (PetscInt k = 0; k < m; k++) { 2015 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2016 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2017 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2018 } 2019 PetscFunctionReturn(PETSC_SUCCESS); 2020 } 2021