1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 /* 228 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 229 230 Input Parameter: 231 . A - the MATSEQAIJKOKKOS matrix 232 233 Output Parameters: 234 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 235 - T_d - the transpose on device, whose value array is allocated but not initialized 236 */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 238 { 239 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 240 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 241 const PetscInt *Ai = aseq->i, *Aj = aseq->j; 242 MatRowMapKokkosViewHost Ti_h("Ti", n + 1); 243 MatRowMapType *Ti = Ti_h.data(); 244 MatColIdxKokkosViewHost Tj_h("Tj", nz); 245 MatRowMapKokkosViewHost perm_h("permutation", nz); 246 PetscInt *Tj = Tj_h.data(); 247 PetscInt *perm = perm_h.data(); 248 PetscInt *offset; 249 250 PetscFunctionBegin; 251 // Populate Ti 252 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 253 Ti++; 254 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 255 Ti--; 256 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 257 258 // Populate Tj and the permutation array 259 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 260 for (PetscInt i = 0; i < m; i++) { 261 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 262 PetscInt r = Aj[j]; // row r of T 263 PetscInt disp = Ti[r] + offset[r]; 264 265 Tj[disp] = i; // col i of T 266 perm[disp] = j; 267 offset[r]++; 268 } 269 } 270 PetscCall(PetscFree(offset)); 271 272 // Sort each row of T, along with the permutation array 273 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 274 275 // Output perm and T on device 276 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 277 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 278 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 279 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 280 PetscFunctionReturn(PETSC_SUCCESS); 281 } 282 283 // Generate the transpose on device and cache it internally 284 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 285 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 286 { 287 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 288 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 289 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 290 KokkosCsrMatrix &T = akok->csrmatT; 291 292 PetscFunctionBegin; 293 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 294 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 295 296 const auto &Aa = akok->a_dual.view_device(); 297 298 if (A->symmetric == PETSC_BOOL3_TRUE) { 299 *csrmatT = akok->csrmat; 300 } else { 301 // See if we already have a cached transpose and its value is up to date 302 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 303 if (!akok->transpose_updated) { // if the value is out of date, update the cached version 304 const auto &perm = akok->transpose_perm; // get the permutation array 305 auto &Ta = T.values; 306 307 PetscCallCXX(Kokkos::parallel_for( 308 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 309 } 310 } else { // Generate T of size n x m for the first time 311 MatRowMapKokkosView perm; 312 313 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 314 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 315 PetscCallCXX(Kokkos::parallel_for( 316 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 317 } 318 akok->transpose_updated = PETSC_TRUE; 319 *csrmatT = akok->csrmatT; 320 } 321 PetscFunctionReturn(PETSC_SUCCESS); 322 } 323 324 // Generate the Hermitian on device and cache it internally 325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 326 { 327 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 328 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 329 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 330 KokkosCsrMatrix &T = akok->csrmatH; 331 332 PetscFunctionBegin; 333 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 334 PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 335 336 const auto &Aa = akok->a_dual.view_device(); 337 338 if (A->hermitian == PETSC_BOOL3_TRUE) { 339 *csrmatH = akok->csrmat; 340 } else { 341 // See if we already have a cached hermitian and its value is up to date 342 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 343 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 344 const auto &perm = akok->transpose_perm; // get the permutation array 345 auto &Ta = T.values; 346 347 PetscCallCXX(Kokkos::parallel_for( 348 nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 349 } 350 } else { // Generate T of size n x m for the first time 351 MatRowMapKokkosView perm; 352 353 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 354 akok->transpose_perm = perm; // cache the perm in this matrix for reuse 355 PetscCallCXX(Kokkos::parallel_for( 356 nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 357 } 358 akok->hermitian_updated = PETSC_TRUE; 359 *csrmatH = akok->csrmatH; 360 } 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 364 /* y = A x */ 365 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 366 { 367 Mat_SeqAIJKokkos *aijkok; 368 ConstPetscScalarKokkosView xv; 369 PetscScalarKokkosView yv; 370 371 PetscFunctionBegin; 372 PetscCall(PetscLogGpuTimeBegin()); 373 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 374 PetscCall(VecGetKokkosView(xx, &xv)); 375 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 376 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 377 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 378 PetscCall(VecRestoreKokkosView(xx, &xv)); 379 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 381 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 382 PetscCall(PetscLogGpuTimeEnd()); 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 /* y = A^T x */ 387 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 388 { 389 Mat_SeqAIJKokkos *aijkok; 390 const char *mode; 391 ConstPetscScalarKokkosView xv; 392 PetscScalarKokkosView yv; 393 KokkosCsrMatrix csrmat; 394 395 PetscFunctionBegin; 396 PetscCall(PetscLogGpuTimeBegin()); 397 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 398 PetscCall(VecGetKokkosView(xx, &xv)); 399 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 400 if (A->form_explicit_transpose) { 401 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 402 mode = "N"; 403 } else { 404 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 405 csrmat = aijkok->csrmat; 406 mode = "T"; 407 } 408 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 409 PetscCall(VecRestoreKokkosView(xx, &xv)); 410 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 411 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 412 PetscCall(PetscLogGpuTimeEnd()); 413 PetscFunctionReturn(PETSC_SUCCESS); 414 } 415 416 /* y = A^H x */ 417 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 418 { 419 Mat_SeqAIJKokkos *aijkok; 420 const char *mode; 421 ConstPetscScalarKokkosView xv; 422 PetscScalarKokkosView yv; 423 KokkosCsrMatrix csrmat; 424 425 PetscFunctionBegin; 426 PetscCall(PetscLogGpuTimeBegin()); 427 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 428 PetscCall(VecGetKokkosView(xx, &xv)); 429 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 430 if (A->form_explicit_transpose) { 431 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 432 mode = "N"; 433 } else { 434 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 435 csrmat = aijkok->csrmat; 436 mode = "C"; 437 } 438 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 439 PetscCall(VecRestoreKokkosView(xx, &xv)); 440 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 441 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 442 PetscCall(PetscLogGpuTimeEnd()); 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 /* z = A x + y */ 447 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 448 { 449 Mat_SeqAIJKokkos *aijkok; 450 ConstPetscScalarKokkosView xv, yv; 451 PetscScalarKokkosView zv; 452 453 PetscFunctionBegin; 454 PetscCall(PetscLogGpuTimeBegin()); 455 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 456 PetscCall(VecGetKokkosView(xx, &xv)); 457 PetscCall(VecGetKokkosView(yy, &yv)); 458 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 459 if (zz != yy) Kokkos::deep_copy(zv, yv); 460 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 461 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 462 PetscCall(VecRestoreKokkosView(xx, &xv)); 463 PetscCall(VecRestoreKokkosView(yy, &yv)); 464 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 465 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 466 PetscCall(PetscLogGpuTimeEnd()); 467 PetscFunctionReturn(PETSC_SUCCESS); 468 } 469 470 /* z = A^T x + y */ 471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 472 { 473 Mat_SeqAIJKokkos *aijkok; 474 const char *mode; 475 ConstPetscScalarKokkosView xv, yv; 476 PetscScalarKokkosView zv; 477 KokkosCsrMatrix csrmat; 478 479 PetscFunctionBegin; 480 PetscCall(PetscLogGpuTimeBegin()); 481 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 482 PetscCall(VecGetKokkosView(xx, &xv)); 483 PetscCall(VecGetKokkosView(yy, &yv)); 484 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 485 if (zz != yy) Kokkos::deep_copy(zv, yv); 486 if (A->form_explicit_transpose) { 487 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 488 mode = "N"; 489 } else { 490 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 491 csrmat = aijkok->csrmat; 492 mode = "T"; 493 } 494 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 495 PetscCall(VecRestoreKokkosView(xx, &xv)); 496 PetscCall(VecRestoreKokkosView(yy, &yv)); 497 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 498 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 499 PetscCall(PetscLogGpuTimeEnd()); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 /* z = A^H x + y */ 504 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 505 { 506 Mat_SeqAIJKokkos *aijkok; 507 const char *mode; 508 ConstPetscScalarKokkosView xv, yv; 509 PetscScalarKokkosView zv; 510 KokkosCsrMatrix csrmat; 511 512 PetscFunctionBegin; 513 PetscCall(PetscLogGpuTimeBegin()); 514 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 515 PetscCall(VecGetKokkosView(xx, &xv)); 516 PetscCall(VecGetKokkosView(yy, &yv)); 517 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 518 if (zz != yy) Kokkos::deep_copy(zv, yv); 519 if (A->form_explicit_transpose) { 520 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 521 mode = "N"; 522 } else { 523 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 524 csrmat = aijkok->csrmat; 525 mode = "C"; 526 } 527 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 528 PetscCall(VecRestoreKokkosView(xx, &xv)); 529 PetscCall(VecRestoreKokkosView(yy, &yv)); 530 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 531 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 532 PetscCall(PetscLogGpuTimeEnd()); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 537 { 538 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539 540 PetscFunctionBegin; 541 switch (op) { 542 case MAT_FORM_EXPLICIT_TRANSPOSE: 543 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 544 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 545 A->form_explicit_transpose = flg; 546 break; 547 default: 548 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 549 break; 550 } 551 PetscFunctionReturn(PETSC_SUCCESS); 552 } 553 554 /* Depending on reuse, either build a new mat, or use the existing mat */ 555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 556 { 557 Mat_SeqAIJ *aseq; 558 559 PetscFunctionBegin; 560 PetscCall(PetscKokkosInitializeCheck()); 561 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 562 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 563 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 564 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 565 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 566 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 567 PetscCall(PetscFree(A->defaultvectype)); 568 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 569 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 570 PetscCall(MatSetOps_SeqAIJKokkos(A)); 571 aseq = static_cast<Mat_SeqAIJ *>(A->data); 572 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 573 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 574 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 575 } 576 } 577 PetscFunctionReturn(PETSC_SUCCESS); 578 } 579 580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 581 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 582 */ 583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 584 { 585 Mat_SeqAIJ *bseq; 586 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 587 Mat mat; 588 589 PetscFunctionBegin; 590 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 591 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 592 mat = *B; 593 if (A->assembled) { 594 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 595 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 596 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 597 /* Now copy values to B if needed */ 598 if (dupOption == MAT_COPY_VALUES) { 599 if (akok->a_dual.need_sync_device()) { 600 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 601 bkok->a_dual.modify_host(); 602 } else { /* If device has the latest data, we only copy data on device */ 603 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 604 bkok->a_dual.modify_device(); 605 } 606 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 607 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 608 bkok->a_dual.modify_host(); 609 } 610 mat->spptr = bkok; 611 } 612 613 PetscCall(PetscFree(mat->defaultvectype)); 614 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 615 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 616 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 617 PetscFunctionReturn(PETSC_SUCCESS); 618 } 619 620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 621 { 622 Mat At; 623 KokkosCsrMatrix internT; 624 Mat_SeqAIJKokkos *atkok, *bkok; 625 626 PetscFunctionBegin; 627 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 628 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 629 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 630 /* Deep copy internT, as we want to isolate the internal transpose */ 631 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 632 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 633 if (reuse == MAT_INITIAL_MATRIX) *B = At; 634 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 635 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 636 if ((*B)->assembled) { 637 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 638 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 639 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 640 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 641 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 642 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 643 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 644 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 645 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 646 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 647 } 648 PetscFunctionReturn(PETSC_SUCCESS); 649 } 650 651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 652 { 653 Mat_SeqAIJKokkos *aijkok; 654 655 PetscFunctionBegin; 656 if (A->factortype == MAT_FACTOR_NONE) { 657 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 658 delete aijkok; 659 } else { 660 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 661 } 662 A->spptr = NULL; 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 664 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 665 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 666 PetscCall(MatDestroy_SeqAIJ(A)); 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 /*MC 671 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 672 673 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 674 675 Options Database Key: 676 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 677 678 Level: beginner 679 680 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 681 M*/ 682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 683 { 684 PetscFunctionBegin; 685 PetscCall(PetscKokkosInitializeCheck()); 686 PetscCall(MatCreate_SeqAIJ(A)); 687 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 688 PetscFunctionReturn(PETSC_SUCCESS); 689 } 690 691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 693 { 694 Mat_SeqAIJ *a, *b; 695 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 696 MatScalarKokkosView aa, ba, ca; 697 MatRowMapKokkosView ai, bi, ci; 698 MatColIdxKokkosView aj, bj, cj; 699 PetscInt m, n, nnz, aN; 700 701 PetscFunctionBegin; 702 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 703 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 704 PetscValidPointer(C, 4); 705 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 706 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 707 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 708 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 709 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 a = static_cast<Mat_SeqAIJ *>(A->data); 713 b = static_cast<Mat_SeqAIJ *>(B->data); 714 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 715 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 716 aa = akok->a_dual.view_device(); 717 ai = akok->i_dual.view_device(); 718 ba = bkok->a_dual.view_device(); 719 bi = bkok->i_dual.view_device(); 720 m = A->rmap->n; /* M, N and nnz of C */ 721 n = A->cmap->n + B->cmap->n; 722 nnz = a->nz + b->nz; 723 aN = A->cmap->n; /* N of A */ 724 if (reuse == MAT_INITIAL_MATRIX) { 725 aj = akok->j_dual.view_device(); 726 bj = bkok->j_dual.view_device(); 727 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 728 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 729 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 730 ca = ca_dual.view_device(); 731 ci = ci_dual.view_device(); 732 cj = cj_dual.view_device(); 733 734 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 735 Kokkos::parallel_for( 736 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 737 PetscInt i = t.league_rank(); /* row i */ 738 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 739 740 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 741 ci(i) = coffset; 742 if (i == m - 1) ci(m) = ai(m) + bi(m); 743 }); 744 745 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746 if (k < alen) { 747 ca(coffset + k) = aa(ai(i) + k); 748 cj(coffset + k) = aj(ai(i) + k); 749 } else { 750 ca(coffset + k) = ba(bi(i) + k - alen); 751 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 752 } 753 }); 754 }); 755 ca_dual.modify_device(); 756 ci_dual.modify_device(); 757 cj_dual.modify_device(); 758 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 759 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 760 } else if (reuse == MAT_REUSE_MATRIX) { 761 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 762 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 763 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 764 ca = ckok->a_dual.view_device(); 765 ci = ckok->i_dual.view_device(); 766 767 Kokkos::parallel_for( 768 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 769 PetscInt i = t.league_rank(); /* row i */ 770 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 771 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 772 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 773 else ca(ci(i) + k) = ba(bi(i) + k - alen); 774 }); 775 }); 776 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 777 } 778 PetscFunctionReturn(PETSC_SUCCESS); 779 } 780 781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 782 { 783 PetscFunctionBegin; 784 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 785 PetscFunctionReturn(PETSC_SUCCESS); 786 } 787 788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 789 { 790 Mat_Product *product = C->product; 791 Mat A, B; 792 bool transA, transB; /* use bool, since KK needs this type */ 793 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 794 Mat_SeqAIJ *c; 795 MatProductData_SeqAIJKokkos *pdata; 796 KokkosCsrMatrix csrmatA, csrmatB; 797 798 PetscFunctionBegin; 799 MatCheckProduct(C, 1); 800 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 801 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 802 803 // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 804 // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 805 // we still do numeric. 806 if (pdata->reusesym) { // numeric reuses results from symbolic 807 pdata->reusesym = PETSC_FALSE; 808 PetscFunctionReturn(PETSC_SUCCESS); 809 } 810 811 switch (product->type) { 812 case MATPRODUCT_AB: 813 transA = false; 814 transB = false; 815 break; 816 case MATPRODUCT_AtB: 817 transA = true; 818 transB = false; 819 break; 820 case MATPRODUCT_ABt: 821 transA = false; 822 transB = true; 823 break; 824 default: 825 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 826 } 827 828 A = product->A; 829 B = product->B; 830 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 831 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 832 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 833 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 834 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 835 836 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 837 838 csrmatA = akok->csrmat; 839 csrmatB = bkok->csrmat; 840 841 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 842 if (transA) { 843 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 844 transA = false; 845 } 846 847 if (transB) { 848 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 849 transB = false; 850 } 851 PetscCall(PetscLogGpuTimeBegin()); 852 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 854 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 855 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 856 #endif 857 858 PetscCall(PetscLogGpuTimeEnd()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 860 /* shorter version of MatAssemblyEnd_SeqAIJ */ 861 c = (Mat_SeqAIJ *)C->data; 862 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 863 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 864 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 865 c->reallocs = 0; 866 C->info.mallocs = 0; 867 C->info.nz_unneeded = 0; 868 C->assembled = C->was_assembled = PETSC_TRUE; 869 C->num_ass++; 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 874 { 875 Mat_Product *product = C->product; 876 MatProductType ptype; 877 Mat A, B; 878 bool transA, transB; 879 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 880 MatProductData_SeqAIJKokkos *pdata; 881 MPI_Comm comm; 882 KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 883 884 PetscFunctionBegin; 885 MatCheckProduct(C, 1); 886 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 887 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 888 A = product->A; 889 B = product->B; 890 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 891 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 892 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 893 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 894 csrmatA = akok->csrmat; 895 csrmatB = bkok->csrmat; 896 897 ptype = product->type; 898 // Take advantage of the symmetry if true 899 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 900 ptype = MATPRODUCT_AB; 901 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 902 } 903 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 904 ptype = MATPRODUCT_AB; 905 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 906 } 907 908 switch (ptype) { 909 case MATPRODUCT_AB: 910 transA = false; 911 transB = false; 912 break; 913 case MATPRODUCT_AtB: 914 transA = true; 915 transB = false; 916 break; 917 case MATPRODUCT_ABt: 918 transA = false; 919 transB = true; 920 break; 921 default: 922 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 923 } 924 PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 925 pdata->reusesym = product->api_user; 926 927 /* TODO: add command line options to select spgemm algorithms */ 928 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 929 930 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 932 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 933 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 934 #endif 935 #endif 936 PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 937 938 PetscCall(PetscLogGpuTimeBegin()); 939 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 940 if (transA) { 941 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 942 transA = false; 943 } 944 945 if (transB) { 946 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 947 transB = false; 948 } 949 950 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 951 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 952 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 953 calling new Mat_SeqAIJKokkos(). 954 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 955 */ 956 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 958 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 959 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 960 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 961 #endif 962 PetscCall(PetscLogGpuTimeEnd()); 963 964 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 965 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 966 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 967 PetscFunctionReturn(PETSC_SUCCESS); 968 } 969 970 /* handles sparse matrix matrix ops */ 971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 972 { 973 Mat_Product *product = mat->product; 974 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 975 976 PetscFunctionBegin; 977 MatCheckProduct(mat, 1); 978 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 979 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 980 if (Biskok && Ciskok) { 981 switch (product->type) { 982 case MATPRODUCT_AB: 983 case MATPRODUCT_AtB: 984 case MATPRODUCT_ABt: 985 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 986 break; 987 case MATPRODUCT_PtAP: 988 case MATPRODUCT_RARt: 989 case MATPRODUCT_ABC: 990 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 991 break; 992 default: 993 break; 994 } 995 } else { /* fallback for AIJ */ 996 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 997 } 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1002 { 1003 Mat_SeqAIJKokkos *aijkok; 1004 1005 PetscFunctionBegin; 1006 PetscCall(PetscLogGpuTimeBegin()); 1007 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 1010 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1011 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 1012 PetscCall(PetscLogGpuTimeEnd()); 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1017 { 1018 Mat_SeqAIJKokkos *aijkok; 1019 1020 PetscFunctionBegin; 1021 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1022 if (aijkok) { /* Only zero the device if data is already there */ 1023 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 1024 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1025 } else { /* Might be preallocated but not assembled */ 1026 PetscCall(MatZeroEntries_SeqAIJ(A)); 1027 } 1028 PetscFunctionReturn(PETSC_SUCCESS); 1029 } 1030 1031 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1032 { 1033 Mat_SeqAIJ *aijseq; 1034 Mat_SeqAIJKokkos *aijkok; 1035 PetscInt n; 1036 PetscScalarKokkosView xv; 1037 1038 PetscFunctionBegin; 1039 PetscCall(VecGetLocalSize(x, &n)); 1040 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1041 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1042 1043 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045 1046 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1047 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1048 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1049 aijkok->SetDiagonal(aijseq->diag); 1050 } 1051 1052 const auto &Aa = aijkok->a_dual.view_device(); 1053 const auto &Ai = aijkok->i_dual.view_device(); 1054 const auto &Adiag = aijkok->diag_dual.view_device(); 1055 1056 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1057 Kokkos::parallel_for( 1058 n, KOKKOS_LAMBDA(const PetscInt i) { 1059 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1060 else xv(i) = 0; 1061 }); 1062 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1063 PetscFunctionReturn(PETSC_SUCCESS); 1064 } 1065 1066 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1067 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1068 { 1069 Mat_SeqAIJKokkos *aijkok; 1070 1071 PetscFunctionBegin; 1072 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1073 PetscValidPointer(kv, 2); 1074 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1075 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1076 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1077 *kv = aijkok->a_dual.view_device(); 1078 PetscFunctionReturn(PETSC_SUCCESS); 1079 } 1080 1081 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1082 { 1083 PetscFunctionBegin; 1084 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1085 PetscValidPointer(kv, 2); 1086 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1087 PetscFunctionReturn(PETSC_SUCCESS); 1088 } 1089 1090 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1091 { 1092 Mat_SeqAIJKokkos *aijkok; 1093 1094 PetscFunctionBegin; 1095 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1096 PetscValidPointer(kv, 2); 1097 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1098 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100 *kv = aijkok->a_dual.view_device(); 1101 PetscFunctionReturn(PETSC_SUCCESS); 1102 } 1103 1104 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1105 { 1106 PetscFunctionBegin; 1107 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1108 PetscValidPointer(kv, 2); 1109 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1110 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1111 PetscFunctionReturn(PETSC_SUCCESS); 1112 } 1113 1114 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1115 { 1116 Mat_SeqAIJKokkos *aijkok; 1117 1118 PetscFunctionBegin; 1119 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1120 PetscValidPointer(kv, 2); 1121 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1122 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1123 *kv = aijkok->a_dual.view_device(); 1124 PetscFunctionReturn(PETSC_SUCCESS); 1125 } 1126 1127 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1128 { 1129 PetscFunctionBegin; 1130 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1131 PetscValidPointer(kv, 2); 1132 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1133 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1134 PetscFunctionReturn(PETSC_SUCCESS); 1135 } 1136 1137 /* Computes Y += alpha X */ 1138 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1139 { 1140 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1141 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1142 ConstMatScalarKokkosView Xa; 1143 MatScalarKokkosView Ya; 1144 1145 PetscFunctionBegin; 1146 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1147 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1148 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1149 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1150 PetscCall(PetscLogGpuTimeBegin()); 1151 1152 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1153 PetscBool e; 1154 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1155 if (e) { 1156 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1157 if (e) pattern = SAME_NONZERO_PATTERN; 1158 } 1159 } 1160 1161 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1162 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1163 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1164 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1165 */ 1166 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1167 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1168 Xa = xkok->a_dual.view_device(); 1169 Ya = ykok->a_dual.view_device(); 1170 1171 if (pattern == SAME_NONZERO_PATTERN) { 1172 KokkosBlas::axpy(alpha, Xa, Ya); 1173 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1174 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1175 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1176 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1177 1178 Kokkos::parallel_for( 1179 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1180 PetscInt i = t.league_rank(); // row i 1181 Kokkos::single(Kokkos::PerTeam(t), [=]() { 1182 // Only one thread works in a team 1183 PetscInt p, q = Yi(i); 1184 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 1185 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 1186 if (Xj(p) == Yj(q)) { // Found it 1187 Ya(q) += alpha * Xa(p); 1188 q++; 1189 } else { 1190 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1191 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1192 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 1193 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 1194 #else 1195 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 1196 #endif 1197 } 1198 } 1199 }); 1200 }); 1201 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1202 } else { // different nonzero patterns 1203 Mat Z; 1204 KokkosCsrMatrix zcsr; 1205 KernelHandle kh; 1206 kh.create_spadd_handle(true); // X, Y are sorted 1207 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1208 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1209 zkok = new Mat_SeqAIJKokkos(zcsr); 1210 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1211 PetscCall(MatHeaderReplace(Y, &Z)); 1212 kh.destroy_spadd_handle(); 1213 } 1214 PetscCall(PetscLogGpuTimeEnd()); 1215 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 1216 PetscFunctionReturn(PETSC_SUCCESS); 1217 } 1218 1219 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1220 { 1221 Mat_SeqAIJKokkos *akok; 1222 Mat_SeqAIJ *aseq; 1223 1224 PetscFunctionBegin; 1225 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1226 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1227 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1228 delete akok; 1229 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1230 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1231 akok->SetUpCOO(aseq); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1236 { 1237 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1238 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1239 PetscCount Annz = aseq->nz; 1240 const PetscCountKokkosView &jmap = akok->jmap_d; 1241 const PetscCountKokkosView &perm = akok->perm_d; 1242 MatScalarKokkosView Aa; 1243 ConstMatScalarKokkosView kv; 1244 PetscMemType memtype; 1245 1246 PetscFunctionBegin; 1247 PetscCall(PetscGetMemType(v, &memtype)); 1248 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1249 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1250 } else { 1251 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1252 } 1253 1254 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1255 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1256 1257 Kokkos::parallel_for( 1258 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1259 PetscScalar sum = 0.0; 1260 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1261 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1262 }); 1263 1264 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1265 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1266 PetscFunctionReturn(PETSC_SUCCESS); 1267 } 1268 1269 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1270 { 1271 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1272 MatScalarKokkosView Aa; 1273 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1274 PetscInt m = A->rmap->n; 1275 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1276 1277 PetscFunctionBegin; 1278 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1279 Kokkos::parallel_for( 1280 m, KOKKOS_LAMBDA(const PetscInt i) { 1281 PetscScalar tmp; 1282 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1283 tmp = Aa(Ai(i)); 1284 Aa(Ai(i)) = Aa(Adiag(i)); 1285 Aa(Adiag(i)) = tmp; 1286 } 1287 }); 1288 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1289 PetscFunctionReturn(PETSC_SUCCESS); 1290 } 1291 1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1293 { 1294 PetscFunctionBegin; 1295 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1296 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1297 B->offloadmask = PETSC_OFFLOAD_CPU; 1298 PetscFunctionReturn(PETSC_SUCCESS); 1299 } 1300 1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1302 { 1303 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1304 1305 PetscFunctionBegin; 1306 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1307 A->boundtocpu = PETSC_FALSE; 1308 1309 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1310 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1311 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1312 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1313 A->ops->scale = MatScale_SeqAIJKokkos; 1314 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1315 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1316 A->ops->mult = MatMult_SeqAIJKokkos; 1317 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1318 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1319 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1320 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1321 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1322 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1323 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1324 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1325 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1326 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1327 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1328 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1329 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1330 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1331 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1332 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1333 1334 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1335 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1340 { 1341 Mat_SeqAIJ *aseq; 1342 PetscInt i, m, n; 1343 1344 PetscFunctionBegin; 1345 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1346 1347 m = akok->nrows(); 1348 n = akok->ncols(); 1349 PetscCall(MatSetSizes(A, m, n, m, n)); 1350 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1351 1352 /* Set up data structures of A as a MATSEQAIJ */ 1353 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1354 aseq = (Mat_SeqAIJ *)(A)->data; 1355 1356 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1357 akok->j_dual.sync_host(); 1358 1359 aseq->i = akok->i_host_data(); 1360 aseq->j = akok->j_host_data(); 1361 aseq->a = akok->a_host_data(); 1362 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1363 aseq->singlemalloc = PETSC_FALSE; 1364 aseq->free_a = PETSC_FALSE; 1365 aseq->free_ij = PETSC_FALSE; 1366 aseq->nz = akok->nnz(); 1367 aseq->maxnz = aseq->nz; 1368 1369 PetscCall(PetscMalloc1(m, &aseq->imax)); 1370 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1371 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1372 1373 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1374 akok->nonzerostate = A->nonzerostate; 1375 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1376 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1377 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1378 PetscFunctionReturn(PETSC_SUCCESS); 1379 } 1380 1381 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 1382 { 1383 PetscFunctionBegin; 1384 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1385 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 1390 { 1391 Mat_SeqAIJKokkos *akok; 1392 PetscFunctionBegin; 1393 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 1394 PetscCall(MatCreate(comm, A)); 1395 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1396 PetscFunctionReturn(PETSC_SUCCESS); 1397 } 1398 1399 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1400 1401 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1402 */ 1403 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1404 { 1405 PetscFunctionBegin; 1406 PetscCall(MatCreate(comm, A)); 1407 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1408 PetscFunctionReturn(PETSC_SUCCESS); 1409 } 1410 1411 /*@C 1412 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1413 (the default parallel PETSc format). This matrix will ultimately be handled by 1414 Kokkos for calculations. 1415 1416 Collective 1417 1418 Input Parameters: 1419 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1420 . m - number of rows 1421 . n - number of columns 1422 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 1423 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 1424 1425 Output Parameter: 1426 . A - the matrix 1427 1428 Level: intermediate 1429 1430 Notes: 1431 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1432 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1433 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1434 1435 The AIJ format, also called 1436 compressed row storage, is fully compatible with standard Fortran 1437 storage. That is, the stored row and column indices can begin at 1438 either one (as in Fortran) or zero. 1439 1440 Specify the preallocated storage with either `nz` or `nnz` (not both). 1441 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1442 allocation. 1443 1444 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1445 @*/ 1446 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1447 { 1448 PetscFunctionBegin; 1449 PetscCall(PetscKokkosInitializeCheck()); 1450 PetscCall(MatCreate(comm, A)); 1451 PetscCall(MatSetSizes(*A, m, n, m, n)); 1452 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1453 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1454 PetscFunctionReturn(PETSC_SUCCESS); 1455 } 1456 1457 typedef Kokkos::TeamPolicy<>::member_type team_member; 1458 // 1459 // This factorization exploits block diagonal matrices with "Nf" (not used). 1460 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1461 // 1462 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1463 { 1464 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1465 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1466 IS isrow = b->row, isicol = b->icol; 1467 const PetscInt *r_h, *ic_h; 1468 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1469 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1470 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1471 PetscBool row_identity, col_identity; 1472 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1473 1474 PetscFunctionBegin; 1475 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1476 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1477 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1478 PetscCall(ISGetIndices(isrow, &r_h)); 1479 PetscCall(ISGetIndices(isicol, &ic_h)); 1480 PetscCall(ISGetSize(isicol, &nc)); 1481 PetscCall(PetscLogGpuTimeBegin()); 1482 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1483 { 1484 #define KOKKOS_SHARED_LEVEL 1 1485 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1486 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1487 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1488 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1489 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1490 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1491 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1492 size_t flops_h = 0.0; 1493 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1494 Kokkos::View<size_t> d_flops_k("flops"); 1495 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1496 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 1497 Kokkos::deep_copy(d_flops_k, h_flops_k); 1498 Kokkos::deep_copy(d_r_k, h_r_k); 1499 Kokkos::deep_copy(d_ic_k, h_ic_k); 1500 // Fill A --> fact 1501 Kokkos::parallel_for( 1502 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1503 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1504 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1505 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1506 // zero rows of B 1507 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1508 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1509 PetscScalar *baL = ba_d + bi_d[rowb]; 1510 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1511 /* zero (unfactored row) */ 1512 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1513 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1514 }); 1515 // copy A into B 1516 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1517 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1518 const PetscScalar *av = aa_d + ai_d[rowa]; 1519 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1520 /* load in initial (unfactored row) */ 1521 for (int j = 0; j < nza; j++) { 1522 PetscInt colb = ic[ajtmp[j]]; 1523 PetscScalar vala = av[j]; 1524 if (colb == rowb) { 1525 *(ba_d + bdiag_d[rowb]) = vala; 1526 } else { 1527 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1528 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1529 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1530 for (int j = 0; j < nz; j++) { 1531 if (pbj[j] == colb) { 1532 pba[j] = vala; 1533 set++; 1534 break; 1535 } 1536 } 1537 #if !defined(PETSC_HAVE_SYCL) 1538 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1539 #else 1540 (void)set; 1541 #endif 1542 } 1543 } 1544 }); 1545 }); 1546 Kokkos::fence(); 1547 1548 Kokkos::parallel_for( 1549 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1550 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1551 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1552 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1553 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1554 const PetscInt start = field * nloc, end = start + nloc; 1555 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1556 // A22 panel update for each row A(1,:) and col A(:,1) 1557 for (int ii = start; ii < end - 1; ii++) { 1558 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1559 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1560 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1561 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1562 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1563 PetscInt kIdx = j * Ni + field_block_idx; 1564 if (kIdx >= nzUi) /* void */ 1565 ; 1566 else { 1567 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1568 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1569 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1570 size_t st_idx; 1571 // find and do L(k,i) = A(:k,i) / A(i,i) 1572 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1573 // get column, there has got to be a better way 1574 Kokkos::parallel_reduce( 1575 Kokkos::ThreadVectorRange(team, nzL), 1576 [&](const int &j, size_t &idx) { 1577 if (pjL[j] == ii) { 1578 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1579 idx = j; // output 1580 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1581 } 1582 }, 1583 st_idx); 1584 Kokkos::single(Kokkos::PerThread(team), [=]() { 1585 colkIdx() = st_idx; 1586 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1587 }); 1588 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1589 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1590 #endif 1591 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1592 // U(i+1,:end) 1593 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1594 PetscScalar Uij = baUi[uiIdx]; 1595 PetscInt col = bjUi[uiIdx]; 1596 if (col == myk) { 1597 // A_kk = A_kk - L_ki * U_ij(k) 1598 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1599 *Akkv = *Akkv - L_ki() * Uij; // UiK 1600 } else { 1601 PetscScalar *start, *end, *pAkjv = NULL; 1602 PetscInt high, low; 1603 const PetscInt *startj; 1604 if (col < myk) { // L 1605 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1606 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1607 start = pLki + 1; // start at pLki+1, A22(myk,1) 1608 startj = bj_d + bi_d[myk] + idx; 1609 end = ba_d + bi_d[myk + 1]; 1610 } else { 1611 PetscInt idx = bdiag_d[myk + 1] + 1; 1612 start = ba_d + idx; 1613 startj = bj_d + idx; 1614 end = ba_d + bdiag_d[myk]; 1615 } 1616 // search for 'col', use bisection search - TODO 1617 low = 0; 1618 high = (PetscInt)(end - start); 1619 while (high - low > 5) { 1620 int t = (low + high) / 2; 1621 if (startj[t] > col) high = t; 1622 else low = t; 1623 } 1624 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1625 if (startj[pAkjv - start] == col) break; 1626 } 1627 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1628 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1629 #endif 1630 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1631 } 1632 }); 1633 } 1634 }); 1635 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1636 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1637 } /* endof for (i=0; i<n; i++) { */ 1638 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1639 Kokkos::atomic_add(&d_flops_k(), flops()); 1640 flops() = 0; 1641 }); 1642 }); 1643 Kokkos::fence(); 1644 Kokkos::deep_copy(h_flops_k, d_flops_k); 1645 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1646 Kokkos::parallel_for( 1647 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1648 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1649 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1650 /* Invert diagonal for simpler triangular solves */ 1651 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1652 int i = start + outer_index * Ni + lg_rank % Ni; 1653 if (i < end) { 1654 PetscScalar *pv = ba_d + bdiag_d[i]; 1655 *pv = 1.0 / (*pv); 1656 } 1657 }); 1658 }); 1659 } 1660 PetscCall(PetscLogGpuTimeEnd()); 1661 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1662 PetscCall(ISRestoreIndices(isrow, &r_h)); 1663 1664 PetscCall(ISIdentity(isrow, &row_identity)); 1665 PetscCall(ISIdentity(isicol, &col_identity)); 1666 if (b->inode.size) { 1667 B->ops->solve = MatSolve_SeqAIJ_Inode; 1668 } else if (row_identity && col_identity) { 1669 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1670 } else { 1671 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1672 } 1673 B->offloadmask = PETSC_OFFLOAD_GPU; 1674 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1675 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1676 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1677 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1678 B->ops->matsolve = MatMatSolve_SeqAIJ; 1679 B->assembled = PETSC_TRUE; 1680 B->preallocated = PETSC_TRUE; 1681 1682 PetscFunctionReturn(PETSC_SUCCESS); 1683 } 1684 1685 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1686 { 1687 PetscFunctionBegin; 1688 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1689 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1690 PetscFunctionReturn(PETSC_SUCCESS); 1691 } 1692 1693 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1694 { 1695 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1696 1697 PetscFunctionBegin; 1698 if (!factors->sptrsv_symbolic_completed) { 1699 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1700 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1701 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1702 } 1703 PetscFunctionReturn(PETSC_SUCCESS); 1704 } 1705 1706 /* Check if we need to update factors etc for transpose solve */ 1707 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1708 { 1709 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1710 MatColIdxType n = A->rmap->n; 1711 1712 PetscFunctionBegin; 1713 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1714 /* Update L^T and do sptrsv symbolic */ 1715 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1716 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1717 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1718 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1719 1720 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1721 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1722 1723 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1724 We have to sort the indices, until KK provides finer control options. 1725 */ 1726 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1727 1728 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1729 1730 /* Update U^T and do sptrsv symbolic */ 1731 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1732 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1733 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1734 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1735 1736 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1737 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1738 1739 /* Sort indices. See comments above */ 1740 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1741 1742 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1743 factors->transpose_updated = PETSC_TRUE; 1744 } 1745 PetscFunctionReturn(PETSC_SUCCESS); 1746 } 1747 1748 /* Solve Ax = b, with A = LU */ 1749 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1750 { 1751 ConstPetscScalarKokkosView bv; 1752 PetscScalarKokkosView xv; 1753 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1754 1755 PetscFunctionBegin; 1756 PetscCall(PetscLogGpuTimeBegin()); 1757 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1758 PetscCall(VecGetKokkosView(b, &bv)); 1759 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1760 /* Solve L tmpv = b */ 1761 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1762 /* Solve Ux = tmpv */ 1763 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1764 PetscCall(VecRestoreKokkosView(b, &bv)); 1765 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1766 PetscCall(PetscLogGpuTimeEnd()); 1767 PetscFunctionReturn(PETSC_SUCCESS); 1768 } 1769 1770 /* Solve A^T x = b, where A^T = U^T L^T */ 1771 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1772 { 1773 ConstPetscScalarKokkosView bv; 1774 PetscScalarKokkosView xv; 1775 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1776 1777 PetscFunctionBegin; 1778 PetscCall(PetscLogGpuTimeBegin()); 1779 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1780 PetscCall(VecGetKokkosView(b, &bv)); 1781 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1782 /* Solve U^T tmpv = b */ 1783 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1784 1785 /* Solve L^T x = tmpv */ 1786 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1787 PetscCall(VecRestoreKokkosView(b, &bv)); 1788 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1789 PetscCall(PetscLogGpuTimeEnd()); 1790 PetscFunctionReturn(PETSC_SUCCESS); 1791 } 1792 1793 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1794 { 1795 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1796 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1797 PetscInt fill_lev = info->levels; 1798 1799 PetscFunctionBegin; 1800 PetscCall(PetscLogGpuTimeBegin()); 1801 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1802 1803 auto a_d = aijkok->a_dual.view_device(); 1804 auto i_d = aijkok->i_dual.view_device(); 1805 auto j_d = aijkok->j_dual.view_device(); 1806 1807 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1808 1809 B->assembled = PETSC_TRUE; 1810 B->preallocated = PETSC_TRUE; 1811 B->ops->solve = MatSolve_SeqAIJKokkos; 1812 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1813 B->ops->matsolve = NULL; 1814 B->ops->matsolvetranspose = NULL; 1815 B->offloadmask = PETSC_OFFLOAD_GPU; 1816 1817 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1818 factors->transpose_updated = PETSC_FALSE; 1819 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1820 /* TODO: log flops, but how to know that? */ 1821 PetscCall(PetscLogGpuTimeEnd()); 1822 PetscFunctionReturn(PETSC_SUCCESS); 1823 } 1824 1825 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1826 { 1827 Mat_SeqAIJKokkos *aijkok; 1828 Mat_SeqAIJ *b; 1829 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1830 PetscInt fill_lev = info->levels; 1831 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1832 PetscInt n = A->rmap->n; 1833 1834 PetscFunctionBegin; 1835 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1836 /* Rebuild factors */ 1837 if (factors) { 1838 factors->Destroy(); 1839 } /* Destroy the old if it exists */ 1840 else { 1841 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1842 } 1843 1844 /* Create a spiluk handle and then do symbolic factorization */ 1845 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1846 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1847 1848 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1849 1850 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1851 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1852 Kokkos::realloc(factors->iU_d, n + 1); 1853 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1854 1855 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1856 auto i_d = aijkok->i_dual.view_device(); 1857 auto j_d = aijkok->j_dual.view_device(); 1858 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1859 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1860 1861 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1862 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1863 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1864 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1865 1866 /* TODO: add options to select sptrsv algorithms */ 1867 /* Create sptrsv handles for L, U and their transpose */ 1868 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1869 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1870 #else 1871 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1872 #endif 1873 1874 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1875 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1876 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1877 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1878 1879 /* Fill fields of the factor matrix B */ 1880 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1881 b = (Mat_SeqAIJ *)B->data; 1882 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1883 B->info.fill_ratio_given = info->fill; 1884 B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 1885 1886 B->offloadmask = PETSC_OFFLOAD_GPU; 1887 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1888 PetscFunctionReturn(PETSC_SUCCESS); 1889 } 1890 1891 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1892 { 1893 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1894 const PetscInt nrows = A->rmap->n; 1895 1896 PetscFunctionBegin; 1897 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1898 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1899 // move B data into Kokkos 1900 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1901 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1902 { 1903 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1904 if (!baijkok->diag_d.extent(0)) { 1905 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1906 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1907 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1908 } 1909 } 1910 PetscFunctionReturn(PETSC_SUCCESS); 1911 } 1912 1913 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1914 { 1915 PetscFunctionBegin; 1916 *type = MATSOLVERKOKKOS; 1917 PetscFunctionReturn(PETSC_SUCCESS); 1918 } 1919 1920 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1921 { 1922 PetscFunctionBegin; 1923 *type = MATSOLVERKOKKOSDEVICE; 1924 PetscFunctionReturn(PETSC_SUCCESS); 1925 } 1926 1927 /*MC 1928 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1929 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1930 1931 Level: beginner 1932 1933 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1934 M*/ 1935 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1936 { 1937 PetscInt n = A->rmap->n; 1938 1939 PetscFunctionBegin; 1940 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1941 PetscCall(MatSetSizes(*B, n, n, n, n)); 1942 (*B)->factortype = ftype; 1943 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1944 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1945 1946 if (ftype == MAT_FACTOR_LU) { 1947 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1948 (*B)->canuseordering = PETSC_TRUE; 1949 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1950 } else if (ftype == MAT_FACTOR_ILU) { 1951 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1952 (*B)->canuseordering = PETSC_FALSE; 1953 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1954 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1955 1956 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1957 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1958 PetscFunctionReturn(PETSC_SUCCESS); 1959 } 1960 1961 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1962 { 1963 PetscInt n = A->rmap->n; 1964 1965 PetscFunctionBegin; 1966 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1967 PetscCall(MatSetSizes(*B, n, n, n, n)); 1968 (*B)->factortype = ftype; 1969 (*B)->canuseordering = PETSC_TRUE; 1970 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1971 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1972 1973 if (ftype == MAT_FACTOR_LU) { 1974 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1975 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1976 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1977 1978 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1979 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1980 PetscFunctionReturn(PETSC_SUCCESS); 1981 } 1982 1983 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1984 { 1985 PetscFunctionBegin; 1986 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1987 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1988 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1989 PetscFunctionReturn(PETSC_SUCCESS); 1990 } 1991 1992 /* Utility to print out a KokkosCsrMatrix for debugging */ 1993 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1994 { 1995 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1996 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1997 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1998 const PetscInt *i = iv.data(); 1999 const PetscInt *j = jv.data(); 2000 const PetscScalar *a = av.data(); 2001 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2002 2003 PetscFunctionBegin; 2004 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2005 for (PetscInt k = 0; k < m; k++) { 2006 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 2007 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 2008 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2009 } 2010 PetscFunctionReturn(PETSC_SUCCESS); 2011 } 2012