1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 /* 228 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 229 230 Input Parameter: 231 . A - the MATSEQAIJKOKKOS matrix 232 233 Output Parameters: 234 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 235 - T_d - the transpose on device, whose value array is allcoated but not initialized 236 */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 238 { 239 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 240 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 241 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 242 MatRowMapKokkosViewHost Ti_h("Ti", n + 1); 243 MatRowMapType *Ti = Ti_h.data(); 244 MatColIdxKokkosViewHost Tj_h("Tj", nz); 245 MatRowMapKokkosViewHost perm_h("permutation", nz); 246 PetscInt *Tj = Tj_h.data(); 247 PetscInt *perm = perm_h.data(); 248 PetscInt *offset; 249 250 PetscFunctionBegin; 251 // Populate Ti 252 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 253 Ti++; 254 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 255 Ti--; 256 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 257 258 // Populate Tj and the permutation array 259 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 260 for (PetscInt i = 0; i < m; i++) { 261 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 262 PetscInt r = Aj[j]; // row r of T 263 PetscInt disp = Ti[r] + offset[r]; 264 265 Tj[disp] = i; // col i of T 266 perm[disp] = j; 267 offset[r]++; 268 } 269 } 270 PetscCall(PetscFree(offset)); 271 272 // Sort each row of T, along with the permutation array 273 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 274 275 // Output perm and T on device 276 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 277 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 278 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 279 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 280 PetscFunctionReturn(PETSC_SUCCESS); 281 } 282 283 // Generate the transpose on device and cache it internally 284 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 285 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 286 { 287 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 288 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 289 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 290 KokkosCsrMatrix &T = akok->csrmatT; 291 292 PetscFunctionBegin; 293 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 294 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 295 296 const auto &Aa = akok->a_dual.view_device(); 297 298 if (A->symmetric == PETSC_BOOL3_TRUE) { 299 *csrmatT = akok->csrmat; 300 } else { 301 // See if we already have a cached transpose and its value is up to date 302 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 303 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 304 const auto &perm = akok->transpose_perm; // get the permutation array 305 auto &Ta = T.values; 306 307 PetscCallCXX(Kokkos::parallel_for( 308 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 309 } 310 } else { // Generate T of size n x m for the first time 311 MatRowMapKokkosView perm; 312 313 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 314 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 315 PetscCallCXX(Kokkos::parallel_for( 316 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 317 } 318 akok->transpose_updated = PETSC_TRUE; 319 *csrmatT = akok->csrmatT; 320 } 321 PetscFunctionReturn(PETSC_SUCCESS); 322 } 323 324 // Generate the Hermitian on device and cache it internally 325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 326 { 327 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 328 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 329 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 330 KokkosCsrMatrix &T = akok->csrmatH; 331 332 PetscFunctionBegin; 333 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 334 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 335 336 const auto &Aa = akok->a_dual.view_device(); 337 338 if (A->hermitian == PETSC_BOOL3_TRUE) { 339 *csrmatH = akok->csrmat; 340 } else { 341 // See if we already have a cached hermitian and its value is up to date 342 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 343 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 344 const auto &perm = akok->transpose_perm; // get the permutation array 345 auto &Ta = T.values; 346 347 PetscCallCXX(Kokkos::parallel_for( 348 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 349 } 350 } else { // Generate T of size n x m for the first time 351 MatRowMapKokkosView perm; 352 353 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 354 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 355 PetscCallCXX(Kokkos::parallel_for( 356 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 357 } 358 akok->hermitian_updated = PETSC_TRUE; 359 *csrmatH = akok->csrmatH; 360 } 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 364 /* y = A x */ 365 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 366 { 367 Mat_SeqAIJKokkos *aijkok; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 371 PetscFunctionBegin; 372 PetscCall(PetscLogGpuTimeBegin()); 373 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 374 PetscCall(VecGetKokkosView(xx, &xv)); 375 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 376 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 377 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 381 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 382 PetscCall(PetscLogGpuTimeEnd()); 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 /* y = A^T x */ 387 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 388 { 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv; 392 PetscScalarKokkosView yv; 393 KokkosCsrMatrix csrmat; 394 395 PetscFunctionBegin; 396 PetscCall(PetscLogGpuTimeBegin()); 397 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 398 PetscCall(VecGetKokkosView(xx, &xv)); 399 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 400 if (A->form_explicit_transpose) { 401 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 402 mode = "N"; 403 } else { 404 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 405 csrmat = aijkok->csrmat; 406 mode = "T"; 407 } 408 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 409 PetscCall(VecRestoreKokkosView(xx, &xv)); 410 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 411 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 412 PetscCall(PetscLogGpuTimeEnd()); 413 PetscFunctionReturn(PETSC_SUCCESS); 414 } 415 416 /* y = A^H x */ 417 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 418 { 419 Mat_SeqAIJKokkos *aijkok; 420 const char *mode; 421 ConstPetscScalarKokkosView xv; 422 PetscScalarKokkosView yv; 423 KokkosCsrMatrix csrmat; 424 425 PetscFunctionBegin; 426 PetscCall(PetscLogGpuTimeBegin()); 427 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 428 PetscCall(VecGetKokkosView(xx, &xv)); 429 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 430 if (A->form_explicit_transpose) { 431 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 432 mode = "N"; 433 } else { 434 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 435 csrmat = aijkok->csrmat; 436 mode = "C"; 437 } 438 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 439 PetscCall(VecRestoreKokkosView(xx, &xv)); 440 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 441 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 442 PetscCall(PetscLogGpuTimeEnd()); 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 /* z = A x + y */ 447 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 448 { 449 Mat_SeqAIJKokkos *aijkok; 450 ConstPetscScalarKokkosView xv, yv; 451 PetscScalarKokkosView zv; 452 453 PetscFunctionBegin; 454 PetscCall(PetscLogGpuTimeBegin()); 455 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 456 PetscCall(VecGetKokkosView(xx, &xv)); 457 PetscCall(VecGetKokkosView(yy, &yv)); 458 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 459 if (zz != yy) Kokkos::deep_copy(zv, yv); 460 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 461 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 462 PetscCall(VecRestoreKokkosView(xx, &xv)); 463 PetscCall(VecRestoreKokkosView(yy, &yv)); 464 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 465 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 466 PetscCall(PetscLogGpuTimeEnd()); 467 PetscFunctionReturn(PETSC_SUCCESS); 468 } 469 470 /* z = A^T x + y */ 471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 472 { 473 Mat_SeqAIJKokkos *aijkok; 474 const char *mode; 475 ConstPetscScalarKokkosView xv, yv; 476 PetscScalarKokkosView zv; 477 KokkosCsrMatrix csrmat; 478 479 PetscFunctionBegin; 480 PetscCall(PetscLogGpuTimeBegin()); 481 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 482 PetscCall(VecGetKokkosView(xx, &xv)); 483 PetscCall(VecGetKokkosView(yy, &yv)); 484 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 485 if (zz != yy) Kokkos::deep_copy(zv, yv); 486 if (A->form_explicit_transpose) { 487 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 488 mode = "N"; 489 } else { 490 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 491 csrmat = aijkok->csrmat; 492 mode = "T"; 493 } 494 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 495 PetscCall(VecRestoreKokkosView(xx, &xv)); 496 PetscCall(VecRestoreKokkosView(yy, &yv)); 497 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 498 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 499 PetscCall(PetscLogGpuTimeEnd()); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 /* z = A^H x + y */ 504 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 505 { 506 Mat_SeqAIJKokkos *aijkok; 507 const char *mode; 508 ConstPetscScalarKokkosView xv, yv; 509 PetscScalarKokkosView zv; 510 KokkosCsrMatrix csrmat; 511 512 PetscFunctionBegin; 513 PetscCall(PetscLogGpuTimeBegin()); 514 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 515 PetscCall(VecGetKokkosView(xx, &xv)); 516 PetscCall(VecGetKokkosView(yy, &yv)); 517 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 518 if (zz != yy) Kokkos::deep_copy(zv, yv); 519 if (A->form_explicit_transpose) { 520 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 521 mode = "N"; 522 } else { 523 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 524 csrmat = aijkok->csrmat; 525 mode = "C"; 526 } 527 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 528 PetscCall(VecRestoreKokkosView(xx, &xv)); 529 PetscCall(VecRestoreKokkosView(yy, &yv)); 530 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 531 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 532 PetscCall(PetscLogGpuTimeEnd()); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 537 { 538 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 540 PetscFunctionBegin; 541 switch (op) { 542 case MAT_FORM_EXPLICIT_TRANSPOSE: 543 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 544 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 545 A->form_explicit_transpose = flg; 546 break; 547 default: 548 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 549 break; 550 } 551 PetscFunctionReturn(PETSC_SUCCESS); 552 } 553 554 /* Depending on reuse, either build a new mat, or use the existing mat */ 555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 556 { 557 Mat_SeqAIJ *aseq; 558 559 PetscFunctionBegin; 560 PetscCall(PetscKokkosInitializeCheck()); 561 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 562 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 563 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 564 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 565 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 566 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 567 PetscCall(PetscFree(A->defaultvectype)); 568 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 569 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 570 PetscCall(MatSetOps_SeqAIJKokkos(A)); 571 aseq = static_cast<Mat_SeqAIJ *>(A->data); 572 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 573 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 574 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 575 } 576 } 577 PetscFunctionReturn(PETSC_SUCCESS); 578 } 579 580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 581 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 582 */ 583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 584 { 585 Mat_SeqAIJ *bseq; 586 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 587 Mat mat; 588 589 PetscFunctionBegin; 590 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 591 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 592 mat = *B; 593 if (A->assembled) { 594 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 595 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 596 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 597 /* Now copy values to B if needed */ 598 if (dupOption == MAT_COPY_VALUES) { 599 if (akok->a_dual.need_sync_device()) { 600 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 601 bkok->a_dual.modify_host(); 602 } else { /* If device has the latest data, we only copy data on device */ 603 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 604 bkok->a_dual.modify_device(); 605 } 606 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 607 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 608 bkok->a_dual.modify_host(); 609 } 610 mat->spptr = bkok; 611 } 612 613 PetscCall(PetscFree(mat->defaultvectype)); 614 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 615 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 616 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 621 { 622 Mat At; 623 KokkosCsrMatrix internT; 624 Mat_SeqAIJKokkos *atkok, *bkok; 625 626 PetscFunctionBegin; 627 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 628 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 629 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 630 /* Deep copy internT, as we want to isolate the internal transpose */ 631 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 632 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 633 if (reuse == MAT_INITIAL_MATRIX) *B = At; 634 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 635 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 636 if ((*B)->assembled) { 637 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 638 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 639 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 640 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 641 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 642 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 643 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 644 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 645 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 646 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 647 } 648 PetscFunctionReturn(PETSC_SUCCESS); 649 } 650 651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 652 { 653 Mat_SeqAIJKokkos *aijkok; 654 655 PetscFunctionBegin; 656 if (A->factortype == MAT_FACTOR_NONE) { 657 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 658 delete aijkok; 659 } else { 660 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 661 } 662 A->spptr = NULL; 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 664 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 665 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 666 PetscCall(MatDestroy_SeqAIJ(A)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /*MC 671 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 672 673 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 674 675 Options Database Key: 676 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 677 678 Level: beginner 679 680 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 681 M*/ 682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 683 { 684 PetscFunctionBegin; 685 PetscCall(PetscKokkosInitializeCheck()); 686 PetscCall(MatCreate_SeqAIJ(A)); 687 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 688 PetscFunctionReturn(PETSC_SUCCESS); 689 } 690 691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 693 { 694 Mat_SeqAIJ *a, *b; 695 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 696 MatScalarKokkosView aa, ba, ca; 697 MatRowMapKokkosView ai, bi, ci; 698 MatColIdxKokkosView aj, bj, cj; 699 PetscInt m, n, nnz, aN; 700 701 PetscFunctionBegin; 702 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 703 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 704 PetscValidPointer(C, 4); 705 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 706 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 707 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 708 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 709 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 a = static_cast<Mat_SeqAIJ *>(A->data); 713 b = static_cast<Mat_SeqAIJ *>(B->data); 714 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 715 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 716 aa = akok->a_dual.view_device(); 717 ai = akok->i_dual.view_device(); 718 ba = bkok->a_dual.view_device(); 719 bi = bkok->i_dual.view_device(); 720 m = A->rmap->n; /* M, N and nnz of C */ 721 n = A->cmap->n + B->cmap->n; 722 nnz = a->nz + b->nz; 723 aN = A->cmap->n; /* N of A */ 724 if (reuse == MAT_INITIAL_MATRIX) { 725 aj = akok->j_dual.view_device(); 726 bj = bkok->j_dual.view_device(); 727 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 728 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 729 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 730 ca = ca_dual.view_device(); 731 ci = ci_dual.view_device(); 732 cj = cj_dual.view_device(); 733 734 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 735 Kokkos::parallel_for( 736 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 737 PetscInt i = t.league_rank(); /* row i */ 738 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 739 740 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 741 ci(i) = coffset; 742 if (i == m - 1) ci(m) = ai(m) + bi(m); 743 }); 744 745 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746 if (k < alen) { 747 ca(coffset + k) = aa(ai(i) + k); 748 cj(coffset + k) = aj(ai(i) + k); 749 } else { 750 ca(coffset + k) = ba(bi(i) + k - alen); 751 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 752 } 753 }); 754 }); 755 ca_dual.modify_device(); 756 ci_dual.modify_device(); 757 cj_dual.modify_device(); 758 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 759 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 760 } else if (reuse == MAT_REUSE_MATRIX) { 761 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 762 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 763 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 764 ca = ckok->a_dual.view_device(); 765 ci = ckok->i_dual.view_device(); 766 767 Kokkos::parallel_for( 768 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 769 PetscInt i = t.league_rank(); /* row i */ 770 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 771 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 772 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 773 else ca(ci(i) + k) = ba(bi(i) + k - alen); 774 }); 775 }); 776 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 777 } 778 PetscFunctionReturn(PETSC_SUCCESS); 779 } 780 781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 782 { 783 PetscFunctionBegin; 784 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 789 { 790 Mat_Product *product = C->product; 791 Mat A, B; 792 bool transA, transB; /* use bool, since KK needs this type */ 793 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 794 Mat_SeqAIJ *c; 795 MatProductData_SeqAIJKokkos *pdata; 796 KokkosCsrMatrix csrmatA, csrmatB; 797 798 PetscFunctionBegin; 799 MatCheckProduct(C, 1); 800 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 801 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 802 803 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 804 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 805 // we still do numeric. 806 if (pdata->reusesym) { // numeric reuses results from symbolic 807 pdata->reusesym = PETSC_FALSE; 808 PetscFunctionReturn(PETSC_SUCCESS); 809 } 810 811 switch (product->type) { 812 case MATPRODUCT_AB: 813 transA = false; 814 transB = false; 815 break; 816 case MATPRODUCT_AtB: 817 transA = true; 818 transB = false; 819 break; 820 case MATPRODUCT_ABt: 821 transA = false; 822 transB = true; 823 break; 824 default: 825 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 826 } 827 828 A = product->A; 829 B = product->B; 830 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 831 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 832 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 833 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 834 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 835 836 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 837 838 csrmatA = akok->csrmat; 839 csrmatB = bkok->csrmat; 840 841 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 842 if (transA) { 843 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 844 transA = false; 845 } 846 847 if (transB) { 848 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 849 transB = false; 850 } 851 PetscCall(PetscLogGpuTimeBegin()); 852 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 854 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 855 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 856 #endif 857 858 PetscCall(PetscLogGpuTimeEnd()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 860 /* shorter version of MatAssemblyEnd_SeqAIJ */ 861 c = (Mat_SeqAIJ *)C->data; 862 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 863 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 864 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 865 c->reallocs = 0; 866 C->info.mallocs = 0; 867 C->info.nz_unneeded = 0; 868 C->assembled = C->was_assembled = PETSC_TRUE; 869 C->num_ass++; 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 874 { 875 Mat_Product *product = C->product; 876 MatProductType ptype; 877 Mat A, B; 878 bool transA, transB; 879 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 880 MatProductData_SeqAIJKokkos *pdata; 881 MPI_Comm comm; 882 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 883 884 PetscFunctionBegin; 885 MatCheckProduct(C, 1); 886 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 887 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 888 A = product->A; 889 B = product->B; 890 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 891 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 892 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 893 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 894 csrmatA = akok->csrmat; 895 csrmatB = bkok->csrmat; 896 897 ptype = product->type; 898 // Take advantage of the symmetry if true 899 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 900 ptype = MATPRODUCT_AB; 901 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 902 } 903 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 904 ptype = MATPRODUCT_AB; 905 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 906 } 907 908 switch (ptype) { 909 case MATPRODUCT_AB: 910 transA = false; 911 transB = false; 912 break; 913 case MATPRODUCT_AtB: 914 transA = true; 915 transB = false; 916 break; 917 case MATPRODUCT_ABt: 918 transA = false; 919 transB = true; 920 break; 921 default: 922 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 923 } 924 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 925 pdata->reusesym = product->api_user; 926 927 /* TODO: add command line options to select spgemm algorithms */ 928 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 929 930 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 932 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 933 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 934 #endif 935 #endif 936 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 937 938 PetscCall(PetscLogGpuTimeBegin()); 939 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 940 if (transA) { 941 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 942 transA = false; 943 } 944 945 if (transB) { 946 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 947 transB = false; 948 } 949 950 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 951 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 952 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 953 calling new Mat_SeqAIJKokkos(). 954 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 955 */ 956 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 958 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 959 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 960 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 961 #endif 962 PetscCall(PetscLogGpuTimeEnd()); 963 964 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 965 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 966 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 /* handles sparse matrix matrix ops */ 971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 972 { 973 Mat_Product *product = mat->product; 974 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 975 976 PetscFunctionBegin; 977 MatCheckProduct(mat, 1); 978 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 979 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 980 if (Biskok && Ciskok) { 981 switch (product->type) { 982 case MATPRODUCT_AB: 983 case MATPRODUCT_AtB: 984 case MATPRODUCT_ABt: 985 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 986 break; 987 case MATPRODUCT_PtAP: 988 case MATPRODUCT_RARt: 989 case MATPRODUCT_ABC: 990 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 991 break; 992 default: 993 break; 994 } 995 } else { /* fallback for AIJ */ 996 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 997 } 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1002 { 1003 Mat_SeqAIJKokkos *aijkok; 1004 1005 PetscFunctionBegin; 1006 PetscCall(PetscLogGpuTimeBegin()); 1007 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1010 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1011 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1012 PetscCall(PetscLogGpuTimeEnd()); 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1017 { 1018 Mat_SeqAIJKokkos *aijkok; 1019 1020 PetscFunctionBegin; 1021 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1022 if (aijkok) { /* Only zero the device if data is already there */ 1023 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1024 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1025 } else { /* Might be preallocated but not assembled */ 1026 PetscCall(MatZeroEntries_SeqAIJ(A)); 1027 } 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 1031 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1032 { 1033 Mat_SeqAIJ *aijseq; 1034 Mat_SeqAIJKokkos *aijkok; 1035 PetscInt n; 1036 PetscScalarKokkosView xv; 1037 1038 PetscFunctionBegin; 1039 PetscCall(VecGetLocalSize(x, &n)); 1040 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1041 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1042 1043 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045 1046 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1047 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1048 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1049 aijkok->SetDiagonal(aijseq->diag); 1050 } 1051 1052 const auto &Aa = aijkok->a_dual.view_device(); 1053 const auto &Ai = aijkok->i_dual.view_device(); 1054 const auto &Adiag = aijkok->diag_dual.view_device(); 1055 1056 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1057 Kokkos::parallel_for( 1058 n, KOKKOS_LAMBDA(const PetscInt i) { 1059 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1060 else xv(i) = 0; 1061 }); 1062 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1063 PetscFunctionReturn(PETSC_SUCCESS); 1064 } 1065 1066 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1067 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1068 { 1069 Mat_SeqAIJKokkos *aijkok; 1070 1071 PetscFunctionBegin; 1072 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1073 PetscValidPointer(kv, 2); 1074 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1075 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1076 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1077 *kv = aijkok->a_dual.view_device(); 1078 PetscFunctionReturn(PETSC_SUCCESS); 1079 } 1080 1081 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1082 { 1083 PetscFunctionBegin; 1084 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1085 PetscValidPointer(kv, 2); 1086 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1091 { 1092 Mat_SeqAIJKokkos *aijkok; 1093 1094 PetscFunctionBegin; 1095 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1096 PetscValidPointer(kv, 2); 1097 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 *kv = aijkok->a_dual.view_device(); 1101 PetscFunctionReturn(PETSC_SUCCESS); 1102 } 1103 1104 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1105 { 1106 PetscFunctionBegin; 1107 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1108 PetscValidPointer(kv, 2); 1109 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1110 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1111 PetscFunctionReturn(PETSC_SUCCESS); 1112 } 1113 1114 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1115 { 1116 Mat_SeqAIJKokkos *aijkok; 1117 1118 PetscFunctionBegin; 1119 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1120 PetscValidPointer(kv, 2); 1121 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1122 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1123 *kv = aijkok->a_dual.view_device(); 1124 PetscFunctionReturn(PETSC_SUCCESS); 1125 } 1126 1127 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1128 { 1129 PetscFunctionBegin; 1130 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1131 PetscValidPointer(kv, 2); 1132 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1133 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1134 PetscFunctionReturn(PETSC_SUCCESS); 1135 } 1136 1137 /* Computes Y += alpha X */ 1138 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1139 { 1140 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1141 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1142 ConstMatScalarKokkosView Xa; 1143 MatScalarKokkosView Ya; 1144 1145 PetscFunctionBegin; 1146 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1147 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1148 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1149 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1150 PetscCall(PetscLogGpuTimeBegin()); 1151 1152 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1153 PetscBool e; 1154 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1155 if (e) { 1156 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1157 if (e) pattern = SAME_NONZERO_PATTERN; 1158 } 1159 } 1160 1161 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1162 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1163 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1164 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1165 */ 1166 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1167 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1168 Xa = xkok->a_dual.view_device(); 1169 Ya = ykok->a_dual.view_device(); 1170 1171 if (pattern == SAME_NONZERO_PATTERN) { 1172 KokkosBlas::axpy(alpha, Xa, Ya); 1173 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1174 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1175 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1176 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1177 1178 Kokkos::parallel_for( 1179 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1180 PetscInt i = t.league_rank(); // row i 1181 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1182 // Only one thread works in a team 1183 PetscInt p, q = Yi(i); 1184 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1185 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1186 if (Xj(p) == Yj(q)) { // Found it 1187 Ya(q) += alpha * Xa(p); 1188 q++; 1189 } else { 1190 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1191 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1192 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1193 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1194 #else 1195 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1196 #endif 1197 } 1198 } 1199 }); 1200 }); 1201 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1202 } else { // different nonzero patterns 1203 Mat Z; 1204 KokkosCsrMatrix zcsr; 1205 KernelHandle kh; 1206 kh.create_spadd_handle(true); // X, Y are sorted 1207 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1208 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1209 zkok = new Mat_SeqAIJKokkos(zcsr); 1210 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1211 PetscCall(MatHeaderReplace(Y, &Z)); 1212 kh.destroy_spadd_handle(); 1213 } 1214 PetscCall(PetscLogGpuTimeEnd()); 1215 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1220 { 1221 Mat_SeqAIJKokkos *akok; 1222 Mat_SeqAIJ *aseq; 1223 1224 PetscFunctionBegin; 1225 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1226 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1227 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1228 delete akok; 1229 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1230 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1231 akok->SetUpCOO(aseq); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1236 { 1237 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1238 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1239 PetscCount Annz = aseq->nz; 1240 const PetscCountKokkosView &jmap = akok->jmap_d; 1241 const PetscCountKokkosView &perm = akok->perm_d; 1242 MatScalarKokkosView Aa; 1243 ConstMatScalarKokkosView kv; 1244 PetscMemType memtype; 1245 1246 PetscFunctionBegin; 1247 PetscCall(PetscGetMemType(v, &memtype)); 1248 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1249 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1250 } else { 1251 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1252 } 1253 1254 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1255 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1256 1257 Kokkos::parallel_for( 1258 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1259 PetscScalar sum = 0.0; 1260 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1261 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1262 }); 1263 1264 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1265 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1266 PetscFunctionReturn(PETSC_SUCCESS); 1267 } 1268 1269 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1270 { 1271 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1272 MatScalarKokkosView Aa; 1273 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1274 PetscInt m = A->rmap->n; 1275 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1276 1277 PetscFunctionBegin; 1278 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1279 Kokkos::parallel_for( 1280 m, KOKKOS_LAMBDA(const PetscInt i) { 1281 PetscScalar tmp; 1282 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1283 tmp = Aa(Ai(i)); 1284 Aa(Ai(i)) = Aa(Adiag(i)); 1285 Aa(Adiag(i)) = tmp; 1286 } 1287 }); 1288 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1293 { 1294 PetscFunctionBegin; 1295 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1296 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1297 B->offloadmask = PETSC_OFFLOAD_CPU; 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1302 { 1303 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1304 1305 PetscFunctionBegin; 1306 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1307 A->boundtocpu = PETSC_FALSE; 1308 1309 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1310 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1311 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1312 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1313 A->ops->scale = MatScale_SeqAIJKokkos; 1314 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1315 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1316 A->ops->mult = MatMult_SeqAIJKokkos; 1317 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1318 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1319 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1320 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1321 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1322 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1323 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1324 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1325 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1326 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1327 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1328 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1329 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1330 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1331 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1332 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1333 1334 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1335 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1340 { 1341 Mat_SeqAIJ *aseq; 1342 PetscInt i, m, n; 1343 1344 PetscFunctionBegin; 1345 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1346 1347 m = akok->nrows(); 1348 n = akok->ncols(); 1349 PetscCall(MatSetSizes(A, m, n, m, n)); 1350 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1351 1352 /* Set up data structures of A as a MATSEQAIJ */ 1353 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1354 aseq = (Mat_SeqAIJ *)(A)->data; 1355 1356 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1357 akok->j_dual.sync_host(); 1358 1359 aseq->i = akok->i_host_data(); 1360 aseq->j = akok->j_host_data(); 1361 aseq->a = akok->a_host_data(); 1362 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1363 aseq->singlemalloc = PETSC_FALSE; 1364 aseq->free_a = PETSC_FALSE; 1365 aseq->free_ij = PETSC_FALSE; 1366 aseq->nz = akok->nnz(); 1367 aseq->maxnz = aseq->nz; 1368 1369 PetscCall(PetscMalloc1(m, &aseq->imax)); 1370 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1371 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1372 1373 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1374 akok->nonzerostate = A->nonzerostate; 1375 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1376 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1377 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1378 PetscFunctionReturn(PETSC_SUCCESS); 1379 } 1380 1381 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1382 { 1383 PetscFunctionBegin; 1384 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1385 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1390 { 1391 Mat_SeqAIJKokkos *akok; 1392 PetscFunctionBegin; 1393 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1394 PetscCall(MatCreate(comm, A)); 1395 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1396 PetscFunctionReturn(PETSC_SUCCESS); 1397 } 1398 1399 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1400 1401 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1402 */ 1403 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1404 { 1405 PetscFunctionBegin; 1406 PetscCall(MatCreate(comm, A)); 1407 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1408 PetscFunctionReturn(PETSC_SUCCESS); 1409 } 1410 1411 /*@C 1412 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1413 (the default parallel PETSc format). This matrix will ultimately be handled by 1414 Kokkos for calculations. 1415 1416 Collective 1417 1418 Input Parameters: 1419 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1420 . m - number of rows 1421 . n - number of columns 1422 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1423 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1424 1425 Output Parameter: 1426 . A - the matrix 1427 1428 Level: intermediate 1429 1430 Notes: 1431 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1432 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1433 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1434 1435 The AIJ format, also called 1436 compressed row storage, is fully compatible with standard Fortran 1437 storage. That is, the stored row and column indices can begin at 1438 either one (as in Fortran) or zero. 1439 1440 Specify the preallocated storage with either `nz` or `nnz` (not both). 1441 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1442 allocation. 1443 1444 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1445 @*/ 1446 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1447 { 1448 PetscFunctionBegin; 1449 PetscCall(PetscKokkosInitializeCheck()); 1450 PetscCall(MatCreate(comm, A)); 1451 PetscCall(MatSetSizes(*A, m, n, m, n)); 1452 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1453 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1454 PetscFunctionReturn(PETSC_SUCCESS); 1455 } 1456 1457 typedef Kokkos::TeamPolicy<>::member_type team_member; 1458 // 1459 // This factorization exploits block diagonal matrices with "Nf" (not used). 1460 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1461 // 1462 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1463 { 1464 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1465 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1466 IS isrow = b->row, isicol = b->icol; 1467 const PetscInt *r_h, *ic_h; 1468 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1469 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1470 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1471 PetscBool row_identity, col_identity; 1472 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1473 1474 PetscFunctionBegin; 1475 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1476 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1477 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1478 PetscCall(ISGetIndices(isrow, &r_h)); 1479 PetscCall(ISGetIndices(isicol, &ic_h)); 1480 PetscCall(ISGetSize(isicol, &nc)); 1481 PetscCall(PetscLogGpuTimeBegin()); 1482 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1483 { 1484 #define KOKKOS_SHARED_LEVEL 1 1485 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1486 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1487 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1488 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1489 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1490 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1491 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1492 size_t flops_h = 0.0; 1493 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1494 Kokkos::View<size_t> d_flops_k("flops"); 1495 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1496 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 1497 Kokkos::deep_copy(d_flops_k, h_flops_k); 1498 Kokkos::deep_copy(d_r_k, h_r_k); 1499 Kokkos::deep_copy(d_ic_k, h_ic_k); 1500 // Fill A --> fact 1501 Kokkos::parallel_for( 1502 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1503 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1504 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1505 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1506 // zero rows of B 1507 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1508 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1509 PetscScalar *baL = ba_d + bi_d[rowb]; 1510 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1511 /* zero (unfactored row) */ 1512 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1513 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1514 }); 1515 // copy A into B 1516 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1517 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1518 const PetscScalar *av = aa_d + ai_d[rowa]; 1519 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1520 /* load in initial (unfactored row) */ 1521 for (int j = 0; j < nza; j++) { 1522 PetscInt colb = ic[ajtmp[j]]; 1523 PetscScalar vala = av[j]; 1524 if (colb == rowb) { 1525 *(ba_d + bdiag_d[rowb]) = vala; 1526 } else { 1527 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1528 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1529 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1530 for (int j = 0; j < nz; j++) { 1531 if (pbj[j] == colb) { 1532 pba[j] = vala; 1533 set++; 1534 break; 1535 } 1536 } 1537 #if !defined(PETSC_HAVE_SYCL) 1538 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1539 #endif 1540 } 1541 } 1542 }); 1543 }); 1544 Kokkos::fence(); 1545 1546 Kokkos::parallel_for( 1547 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1548 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1549 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1550 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1551 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1552 const PetscInt start = field * nloc, end = start + nloc; 1553 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1554 // A22 panel update for each row A(1,:) and col A(:,1) 1555 for (int ii = start; ii < end - 1; ii++) { 1556 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1557 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1558 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1559 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1560 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1561 PetscInt kIdx = j * Ni + field_block_idx; 1562 if (kIdx >= nzUi) /* void */ 1563 ; 1564 else { 1565 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1566 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1567 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1568 size_t st_idx; 1569 // find and do L(k,i) = A(:k,i) / A(i,i) 1570 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1571 // get column, there has got to be a better way 1572 Kokkos::parallel_reduce( 1573 Kokkos::ThreadVectorRange(team, nzL), 1574 [&](const int &j, size_t &idx) { 1575 if (pjL[j] == ii) { 1576 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1577 idx = j; // output 1578 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1579 } 1580 }, 1581 st_idx); 1582 Kokkos::single(Kokkos::PerThread(team), [=]() { 1583 colkIdx() = st_idx; 1584 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1585 }); 1586 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1587 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1588 #endif 1589 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1590 // U(i+1,:end) 1591 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1592 PetscScalar Uij = baUi[uiIdx]; 1593 PetscInt col = bjUi[uiIdx]; 1594 if (col == myk) { 1595 // A_kk = A_kk - L_ki * U_ij(k) 1596 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1597 *Akkv = *Akkv - L_ki() * Uij; // UiK 1598 } else { 1599 PetscScalar *start, *end, *pAkjv = NULL; 1600 PetscInt high, low; 1601 const PetscInt *startj; 1602 if (col < myk) { // L 1603 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1604 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1605 start = pLki + 1; // start at pLki+1, A22(myk,1) 1606 startj = bj_d + bi_d[myk] + idx; 1607 end = ba_d + bi_d[myk + 1]; 1608 } else { 1609 PetscInt idx = bdiag_d[myk + 1] + 1; 1610 start = ba_d + idx; 1611 startj = bj_d + idx; 1612 end = ba_d + bdiag_d[myk]; 1613 } 1614 // search for 'col', use bisection search - TODO 1615 low = 0; 1616 high = (PetscInt)(end - start); 1617 while (high - low > 5) { 1618 int t = (low + high) / 2; 1619 if (startj[t] > col) high = t; 1620 else low = t; 1621 } 1622 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1623 if (startj[pAkjv - start] == col) break; 1624 } 1625 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1626 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1627 #endif 1628 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1629 } 1630 }); 1631 } 1632 }); 1633 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1634 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1635 } /* endof for (i=0; i<n; i++) { */ 1636 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1637 Kokkos::atomic_add(&d_flops_k(), flops()); 1638 flops() = 0; 1639 }); 1640 }); 1641 Kokkos::fence(); 1642 Kokkos::deep_copy(h_flops_k, d_flops_k); 1643 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1644 Kokkos::parallel_for( 1645 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1646 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1647 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1648 /* Invert diagonal for simpler triangular solves */ 1649 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1650 int i = start + outer_index * Ni + lg_rank % Ni; 1651 if (i < end) { 1652 PetscScalar *pv = ba_d + bdiag_d[i]; 1653 *pv = 1.0 / (*pv); 1654 } 1655 }); 1656 }); 1657 } 1658 PetscCall(PetscLogGpuTimeEnd()); 1659 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1660 PetscCall(ISRestoreIndices(isrow, &r_h)); 1661 1662 PetscCall(ISIdentity(isrow, &row_identity)); 1663 PetscCall(ISIdentity(isicol, &col_identity)); 1664 if (b->inode.size) { 1665 B->ops->solve = MatSolve_SeqAIJ_Inode; 1666 } else if (row_identity && col_identity) { 1667 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1668 } else { 1669 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1670 } 1671 B->offloadmask = PETSC_OFFLOAD_GPU; 1672 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1673 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1674 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1675 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1676 B->ops->matsolve = MatMatSolve_SeqAIJ; 1677 B->assembled = PETSC_TRUE; 1678 B->preallocated = PETSC_TRUE; 1679 1680 PetscFunctionReturn(PETSC_SUCCESS); 1681 } 1682 1683 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1684 { 1685 PetscFunctionBegin; 1686 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1687 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1688 PetscFunctionReturn(PETSC_SUCCESS); 1689 } 1690 1691 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1692 { 1693 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1694 1695 PetscFunctionBegin; 1696 if (!factors->sptrsv_symbolic_completed) { 1697 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1698 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1699 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1700 } 1701 PetscFunctionReturn(PETSC_SUCCESS); 1702 } 1703 1704 /* Check if we need to update factors etc for transpose solve */ 1705 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1706 { 1707 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1708 MatColIdxType n = A->rmap->n; 1709 1710 PetscFunctionBegin; 1711 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1712 /* Update L^T and do sptrsv symbolic */ 1713 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1714 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1715 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1716 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1717 1718 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1719 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1720 1721 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1722 We have to sort the indices, until KK provides finer control options. 1723 */ 1724 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1725 1726 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1727 1728 /* Update U^T and do sptrsv symbolic */ 1729 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1730 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1731 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1732 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1733 1734 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1735 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1736 1737 /* Sort indices. See comments above */ 1738 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1739 1740 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1741 factors->transpose_updated = PETSC_TRUE; 1742 } 1743 PetscFunctionReturn(PETSC_SUCCESS); 1744 } 1745 1746 /* Solve Ax = b, with A = LU */ 1747 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1748 { 1749 ConstPetscScalarKokkosView bv; 1750 PetscScalarKokkosView xv; 1751 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1752 1753 PetscFunctionBegin; 1754 PetscCall(PetscLogGpuTimeBegin()); 1755 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1756 PetscCall(VecGetKokkosView(b, &bv)); 1757 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1758 /* Solve L tmpv = b */ 1759 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1760 /* Solve Ux = tmpv */ 1761 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1762 PetscCall(VecRestoreKokkosView(b, &bv)); 1763 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1764 PetscCall(PetscLogGpuTimeEnd()); 1765 PetscFunctionReturn(PETSC_SUCCESS); 1766 } 1767 1768 /* Solve A^T x = b, where A^T = U^T L^T */ 1769 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1770 { 1771 ConstPetscScalarKokkosView bv; 1772 PetscScalarKokkosView xv; 1773 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1774 1775 PetscFunctionBegin; 1776 PetscCall(PetscLogGpuTimeBegin()); 1777 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1778 PetscCall(VecGetKokkosView(b, &bv)); 1779 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1780 /* Solve U^T tmpv = b */ 1781 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1782 1783 /* Solve L^T x = tmpv */ 1784 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1785 PetscCall(VecRestoreKokkosView(b, &bv)); 1786 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1787 PetscCall(PetscLogGpuTimeEnd()); 1788 PetscFunctionReturn(PETSC_SUCCESS); 1789 } 1790 1791 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1792 { 1793 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1794 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1795 PetscInt fill_lev = info->levels; 1796 1797 PetscFunctionBegin; 1798 PetscCall(PetscLogGpuTimeBegin()); 1799 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1800 1801 auto a_d = aijkok->a_dual.view_device(); 1802 auto i_d = aijkok->i_dual.view_device(); 1803 auto j_d = aijkok->j_dual.view_device(); 1804 1805 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1806 1807 B->assembled = PETSC_TRUE; 1808 B->preallocated = PETSC_TRUE; 1809 B->ops->solve = MatSolve_SeqAIJKokkos; 1810 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1811 B->ops->matsolve = NULL; 1812 B->ops->matsolvetranspose = NULL; 1813 B->offloadmask = PETSC_OFFLOAD_GPU; 1814 1815 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1816 factors->transpose_updated = PETSC_FALSE; 1817 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1818 /* TODO: log flops, but how to know that? */ 1819 PetscCall(PetscLogGpuTimeEnd()); 1820 PetscFunctionReturn(PETSC_SUCCESS); 1821 } 1822 1823 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1824 { 1825 Mat_SeqAIJKokkos *aijkok; 1826 Mat_SeqAIJ *b; 1827 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1828 PetscInt fill_lev = info->levels; 1829 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1830 PetscInt n = A->rmap->n; 1831 1832 PetscFunctionBegin; 1833 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1834 /* Rebuild factors */ 1835 if (factors) { 1836 factors->Destroy(); 1837 } /* Destroy the old if it exists */ 1838 else { 1839 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1840 } 1841 1842 /* Create a spiluk handle and then do symbolic factorization */ 1843 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1844 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1845 1846 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1847 1848 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1849 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1850 Kokkos::realloc(factors->iU_d, n + 1); 1851 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1852 1853 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1854 auto i_d = aijkok->i_dual.view_device(); 1855 auto j_d = aijkok->j_dual.view_device(); 1856 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1857 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1858 1859 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1860 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1861 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1862 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1863 1864 /* TODO: add options to select sptrsv algorithms */ 1865 /* Create sptrsv handles for L, U and their transpose */ 1866 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1867 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1868 #else 1869 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1870 #endif 1871 1872 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1873 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1874 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1875 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1876 1877 /* Fill fields of the factor matrix B */ 1878 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1879 b = (Mat_SeqAIJ *)B->data; 1880 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1881 B->info.fill_ratio_given = info->fill; 1882 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1883 1884 B->offloadmask = PETSC_OFFLOAD_GPU; 1885 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1886 PetscFunctionReturn(PETSC_SUCCESS); 1887 } 1888 1889 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1890 { 1891 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1892 const PetscInt nrows = A->rmap->n; 1893 1894 PetscFunctionBegin; 1895 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1896 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1897 // move B data into Kokkos 1898 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1899 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1900 { 1901 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1902 if (!baijkok->diag_d.extent(0)) { 1903 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1904 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1905 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1906 } 1907 } 1908 PetscFunctionReturn(PETSC_SUCCESS); 1909 } 1910 1911 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1912 { 1913 PetscFunctionBegin; 1914 *type = MATSOLVERKOKKOS; 1915 PetscFunctionReturn(PETSC_SUCCESS); 1916 } 1917 1918 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1919 { 1920 PetscFunctionBegin; 1921 *type = MATSOLVERKOKKOSDEVICE; 1922 PetscFunctionReturn(PETSC_SUCCESS); 1923 } 1924 1925 /*MC 1926 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1927 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1928 1929 Level: beginner 1930 1931 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1932 M*/ 1933 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1934 { 1935 PetscInt n = A->rmap->n; 1936 1937 PetscFunctionBegin; 1938 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1939 PetscCall(MatSetSizes(*B, n, n, n, n)); 1940 (*B)->factortype = ftype; 1941 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1942 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1943 1944 if (ftype == MAT_FACTOR_LU) { 1945 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1946 (*B)->canuseordering = PETSC_TRUE; 1947 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1948 } else if (ftype == MAT_FACTOR_ILU) { 1949 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1950 (*B)->canuseordering = PETSC_FALSE; 1951 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1952 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1953 1954 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1955 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1956 PetscFunctionReturn(PETSC_SUCCESS); 1957 } 1958 1959 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1960 { 1961 PetscInt n = A->rmap->n; 1962 1963 PetscFunctionBegin; 1964 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1965 PetscCall(MatSetSizes(*B, n, n, n, n)); 1966 (*B)->factortype = ftype; 1967 (*B)->canuseordering = PETSC_TRUE; 1968 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1969 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1970 1971 if (ftype == MAT_FACTOR_LU) { 1972 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1973 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1974 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1975 1976 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1977 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1978 PetscFunctionReturn(PETSC_SUCCESS); 1979 } 1980 1981 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1982 { 1983 PetscFunctionBegin; 1984 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1985 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1986 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1987 PetscFunctionReturn(PETSC_SUCCESS); 1988 } 1989 1990 /* Utility to print out a KokkosCsrMatrix for debugging */ 1991 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1992 { 1993 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1994 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1995 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1996 const PetscInt *i = iv.data(); 1997 const PetscInt *j = jv.data(); 1998 const PetscScalar *a = av.data(); 1999 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2000 2001 PetscFunctionBegin; 2002 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2003 for (PetscInt k = 0; k < m; k++) { 2004 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2005 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2006 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2007 } 2008 PetscFunctionReturn(PETSC_SUCCESS); 2009 } 2010