1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(0); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(0); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(0); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(0); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(0); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(0); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(0); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(0); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(0); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(0); 225 } 226 227 /* Generate the transpose on device and cache it internally */ 228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 234 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 235 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 236 PetscCallCXX(aijkok->a_dual.sync_device()); 237 PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 238 PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 239 aijkok->transpose_updated = PETSC_TRUE; 240 } 241 *csrmatT = &aijkok->csrmatT; 242 PetscFunctionReturn(0); 243 } 244 245 /* Generate the Hermitian on device and cache it internally */ 246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 247 { 248 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 249 250 PetscFunctionBegin; 251 PetscCall(PetscLogGpuTimeBegin()); 252 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 253 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 254 PetscCallCXX(aijkok->a_dual.sync_device()); 255 PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 256 PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 257 #if defined(PETSC_USE_COMPLEX) 258 const auto &a = aijkok->csrmatH.values; 259 Kokkos::parallel_for( 260 a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 261 #endif 262 aijkok->hermitian_updated = PETSC_TRUE; 263 } 264 *csrmatH = &aijkok->csrmatH; 265 PetscCall(PetscLogGpuTimeEnd()); 266 PetscFunctionReturn(0); 267 } 268 269 /* y = A x */ 270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 271 { 272 Mat_SeqAIJKokkos *aijkok; 273 ConstPetscScalarKokkosView xv; 274 PetscScalarKokkosView yv; 275 276 PetscFunctionBegin; 277 PetscCall(PetscLogGpuTimeBegin()); 278 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 279 PetscCall(VecGetKokkosView(xx, &xv)); 280 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 281 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 282 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 283 PetscCall(VecRestoreKokkosView(xx, &xv)); 284 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 285 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 286 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 287 PetscCall(PetscLogGpuTimeEnd()); 288 PetscFunctionReturn(0); 289 } 290 291 /* y = A^T x */ 292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 293 { 294 Mat_SeqAIJKokkos *aijkok; 295 const char *mode; 296 ConstPetscScalarKokkosView xv; 297 PetscScalarKokkosView yv; 298 KokkosCsrMatrix *csrmat; 299 300 PetscFunctionBegin; 301 PetscCall(PetscLogGpuTimeBegin()); 302 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 303 PetscCall(VecGetKokkosView(xx, &xv)); 304 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 305 if (A->form_explicit_transpose) { 306 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 307 mode = "N"; 308 } else { 309 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 csrmat = &aijkok->csrmat; 311 mode = "T"; 312 } 313 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 314 PetscCall(VecRestoreKokkosView(xx, &xv)); 315 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 316 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 317 PetscCall(PetscLogGpuTimeEnd()); 318 PetscFunctionReturn(0); 319 } 320 321 /* y = A^H x */ 322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 323 { 324 Mat_SeqAIJKokkos *aijkok; 325 const char *mode; 326 ConstPetscScalarKokkosView xv; 327 PetscScalarKokkosView yv; 328 KokkosCsrMatrix *csrmat; 329 330 PetscFunctionBegin; 331 PetscCall(PetscLogGpuTimeBegin()); 332 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 333 PetscCall(VecGetKokkosView(xx, &xv)); 334 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 335 if (A->form_explicit_transpose) { 336 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 337 mode = "N"; 338 } else { 339 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 340 csrmat = &aijkok->csrmat; 341 mode = "C"; 342 } 343 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 344 PetscCall(VecRestoreKokkosView(xx, &xv)); 345 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 346 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 347 PetscCall(PetscLogGpuTimeEnd()); 348 PetscFunctionReturn(0); 349 } 350 351 /* z = A x + y */ 352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 353 { 354 Mat_SeqAIJKokkos *aijkok; 355 ConstPetscScalarKokkosView xv, yv; 356 PetscScalarKokkosView zv; 357 358 PetscFunctionBegin; 359 PetscCall(PetscLogGpuTimeBegin()); 360 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 361 PetscCall(VecGetKokkosView(xx, &xv)); 362 PetscCall(VecGetKokkosView(yy, &yv)); 363 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 364 if (zz != yy) Kokkos::deep_copy(zv, yv); 365 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 366 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 367 PetscCall(VecRestoreKokkosView(xx, &xv)); 368 PetscCall(VecRestoreKokkosView(yy, &yv)); 369 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 370 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 371 PetscCall(PetscLogGpuTimeEnd()); 372 PetscFunctionReturn(0); 373 } 374 375 /* z = A^T x + y */ 376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 377 { 378 Mat_SeqAIJKokkos *aijkok; 379 const char *mode; 380 ConstPetscScalarKokkosView xv, yv; 381 PetscScalarKokkosView zv; 382 KokkosCsrMatrix *csrmat; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosView(yy, &yv)); 389 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 390 if (zz != yy) Kokkos::deep_copy(zv, yv); 391 if (A->form_explicit_transpose) { 392 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 393 mode = "N"; 394 } else { 395 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 396 csrmat = &aijkok->csrmat; 397 mode = "T"; 398 } 399 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 400 PetscCall(VecRestoreKokkosView(xx, &xv)); 401 PetscCall(VecRestoreKokkosView(yy, &yv)); 402 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 403 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 404 PetscCall(PetscLogGpuTimeEnd()); 405 PetscFunctionReturn(0); 406 } 407 408 /* z = A^H x + y */ 409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 410 { 411 Mat_SeqAIJKokkos *aijkok; 412 const char *mode; 413 ConstPetscScalarKokkosView xv, yv; 414 PetscScalarKokkosView zv; 415 KokkosCsrMatrix *csrmat; 416 417 PetscFunctionBegin; 418 PetscCall(PetscLogGpuTimeBegin()); 419 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 420 PetscCall(VecGetKokkosView(xx, &xv)); 421 PetscCall(VecGetKokkosView(yy, &yv)); 422 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 423 if (zz != yy) Kokkos::deep_copy(zv, yv); 424 if (A->form_explicit_transpose) { 425 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 426 mode = "N"; 427 } else { 428 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 429 csrmat = &aijkok->csrmat; 430 mode = "C"; 431 } 432 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 433 PetscCall(VecRestoreKokkosView(xx, &xv)); 434 PetscCall(VecRestoreKokkosView(yy, &yv)); 435 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 436 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 437 PetscCall(PetscLogGpuTimeEnd()); 438 PetscFunctionReturn(0); 439 } 440 441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 442 { 443 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 444 445 PetscFunctionBegin; 446 switch (op) { 447 case MAT_FORM_EXPLICIT_TRANSPOSE: 448 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 449 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 450 A->form_explicit_transpose = flg; 451 break; 452 default: 453 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 454 break; 455 } 456 PetscFunctionReturn(0); 457 } 458 459 /* Depending on reuse, either build a new mat, or use the existing mat */ 460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 461 { 462 Mat_SeqAIJ *aseq; 463 464 PetscFunctionBegin; 465 PetscCall(PetscKokkosInitializeCheck()); 466 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 467 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 468 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 469 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 470 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 471 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 472 PetscCall(PetscFree(A->defaultvectype)); 473 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 474 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 475 PetscCall(MatSetOps_SeqAIJKokkos(A)); 476 aseq = static_cast<Mat_SeqAIJ *>(A->data); 477 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 478 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 479 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 480 } 481 } 482 PetscFunctionReturn(0); 483 } 484 485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 486 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 487 */ 488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 489 { 490 Mat_SeqAIJ *bseq; 491 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 492 Mat mat; 493 494 PetscFunctionBegin; 495 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 496 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 497 mat = *B; 498 if (A->assembled) { 499 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 500 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 501 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 502 /* Now copy values to B if needed */ 503 if (dupOption == MAT_COPY_VALUES) { 504 if (akok->a_dual.need_sync_device()) { 505 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 506 bkok->a_dual.modify_host(); 507 } else { /* If device has the latest data, we only copy data on device */ 508 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 509 bkok->a_dual.modify_device(); 510 } 511 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 512 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 513 bkok->a_dual.modify_host(); 514 } 515 mat->spptr = bkok; 516 } 517 518 PetscCall(PetscFree(mat->defaultvectype)); 519 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 520 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 521 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 522 PetscFunctionReturn(0); 523 } 524 525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 526 { 527 Mat At; 528 KokkosCsrMatrix *internT; 529 Mat_SeqAIJKokkos *atkok, *bkok; 530 531 PetscFunctionBegin; 532 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 533 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 534 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 535 /* Deep copy internT, as we want to isolate the internal transpose */ 536 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 537 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 538 if (reuse == MAT_INITIAL_MATRIX) *B = At; 539 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 540 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 541 if ((*B)->assembled) { 542 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 543 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 544 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 545 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 546 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 547 MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 548 MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 549 PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 550 PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 551 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 552 } 553 PetscFunctionReturn(0); 554 } 555 556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 557 { 558 Mat_SeqAIJKokkos *aijkok; 559 560 PetscFunctionBegin; 561 if (A->factortype == MAT_FACTOR_NONE) { 562 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 563 delete aijkok; 564 } else { 565 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 566 } 567 A->spptr = NULL; 568 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 569 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 570 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 571 PetscCall(MatDestroy_SeqAIJ(A)); 572 PetscFunctionReturn(0); 573 } 574 575 /*MC 576 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 577 578 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 579 580 Options Database Keys: 581 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 582 583 Level: beginner 584 585 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 586 M*/ 587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 588 { 589 PetscFunctionBegin; 590 PetscCall(PetscKokkosInitializeCheck()); 591 PetscCall(MatCreate_SeqAIJ(A)); 592 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 593 PetscFunctionReturn(0); 594 } 595 596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 598 { 599 Mat_SeqAIJ *a, *b; 600 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 601 MatScalarKokkosView aa, ba, ca; 602 MatRowMapKokkosView ai, bi, ci; 603 MatColIdxKokkosView aj, bj, cj; 604 PetscInt m, n, nnz, aN; 605 606 PetscFunctionBegin; 607 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 608 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 609 PetscValidPointer(C, 4); 610 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 611 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 612 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 613 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 614 615 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 616 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 617 a = static_cast<Mat_SeqAIJ *>(A->data); 618 b = static_cast<Mat_SeqAIJ *>(B->data); 619 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 620 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 621 aa = akok->a_dual.view_device(); 622 ai = akok->i_dual.view_device(); 623 ba = bkok->a_dual.view_device(); 624 bi = bkok->i_dual.view_device(); 625 m = A->rmap->n; /* M, N and nnz of C */ 626 n = A->cmap->n + B->cmap->n; 627 nnz = a->nz + b->nz; 628 aN = A->cmap->n; /* N of A */ 629 if (reuse == MAT_INITIAL_MATRIX) { 630 aj = akok->j_dual.view_device(); 631 bj = bkok->j_dual.view_device(); 632 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 633 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 634 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 635 ca = ca_dual.view_device(); 636 ci = ci_dual.view_device(); 637 cj = cj_dual.view_device(); 638 639 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 640 Kokkos::parallel_for( 641 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 642 PetscInt i = t.league_rank(); /* row i */ 643 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 644 645 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 646 ci(i) = coffset; 647 if (i == m - 1) ci(m) = ai(m) + bi(m); 648 }); 649 650 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 651 if (k < alen) { 652 ca(coffset + k) = aa(ai(i) + k); 653 cj(coffset + k) = aj(ai(i) + k); 654 } else { 655 ca(coffset + k) = ba(bi(i) + k - alen); 656 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 657 } 658 }); 659 }); 660 ca_dual.modify_device(); 661 ci_dual.modify_device(); 662 cj_dual.modify_device(); 663 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 664 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 665 } else if (reuse == MAT_REUSE_MATRIX) { 666 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 667 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 668 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 669 ca = ckok->a_dual.view_device(); 670 ci = ckok->i_dual.view_device(); 671 672 Kokkos::parallel_for( 673 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 674 PetscInt i = t.league_rank(); /* row i */ 675 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 676 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 677 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 678 else ca(ci(i) + k) = ba(bi(i) + k - alen); 679 }); 680 }); 681 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 682 } 683 PetscFunctionReturn(0); 684 } 685 686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 687 { 688 PetscFunctionBegin; 689 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 690 PetscFunctionReturn(0); 691 } 692 693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 694 { 695 Mat_Product *product = C->product; 696 Mat A, B; 697 bool transA, transB; /* use bool, since KK needs this type */ 698 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 699 Mat_SeqAIJ *c; 700 MatProductData_SeqAIJKokkos *pdata; 701 KokkosCsrMatrix *csrmatA, *csrmatB; 702 703 PetscFunctionBegin; 704 MatCheckProduct(C, 1); 705 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 706 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 707 708 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 709 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 710 PetscFunctionReturn(0); 711 } 712 713 switch (product->type) { 714 case MATPRODUCT_AB: 715 transA = false; 716 transB = false; 717 break; 718 case MATPRODUCT_AtB: 719 transA = true; 720 transB = false; 721 break; 722 case MATPRODUCT_ABt: 723 transA = false; 724 transB = true; 725 break; 726 default: 727 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 728 } 729 730 A = product->A; 731 B = product->B; 732 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 733 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 734 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 735 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 736 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 737 738 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 739 740 csrmatA = &akok->csrmat; 741 csrmatB = &bkok->csrmat; 742 743 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 744 if (transA) { 745 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 746 transA = false; 747 } 748 749 if (transB) { 750 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 751 transB = false; 752 } 753 PetscCall(PetscLogGpuTimeBegin()); 754 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 755 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 756 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 757 758 PetscCall(PetscLogGpuTimeEnd()); 759 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 760 /* shorter version of MatAssemblyEnd_SeqAIJ */ 761 c = (Mat_SeqAIJ *)C->data; 762 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 763 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 764 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 765 c->reallocs = 0; 766 C->info.mallocs = 0; 767 C->info.nz_unneeded = 0; 768 C->assembled = C->was_assembled = PETSC_TRUE; 769 C->num_ass++; 770 PetscFunctionReturn(0); 771 } 772 773 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 774 { 775 Mat_Product *product = C->product; 776 MatProductType ptype; 777 Mat A, B; 778 bool transA, transB; 779 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 780 MatProductData_SeqAIJKokkos *pdata; 781 MPI_Comm comm; 782 KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 783 784 PetscFunctionBegin; 785 MatCheckProduct(C, 1); 786 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 787 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 788 A = product->A; 789 B = product->B; 790 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 791 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 792 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 793 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 794 csrmatA = &akok->csrmat; 795 csrmatB = &bkok->csrmat; 796 797 ptype = product->type; 798 switch (ptype) { 799 case MATPRODUCT_AB: 800 transA = false; 801 transB = false; 802 break; 803 case MATPRODUCT_AtB: 804 transA = true; 805 transB = false; 806 break; 807 case MATPRODUCT_ABt: 808 transA = false; 809 transB = true; 810 break; 811 default: 812 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 813 } 814 815 product->data = pdata = new MatProductData_SeqAIJKokkos(); 816 pdata->kh.set_team_work_size(16); 817 pdata->kh.set_dynamic_scheduling(true); 818 pdata->reusesym = product->api_user; 819 820 /* TODO: add command line options to select spgemm algorithms */ 821 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 822 823 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 824 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 825 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 826 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 827 #endif 828 #endif 829 830 pdata->kh.create_spgemm_handle(spgemm_alg); 831 832 PetscCall(PetscLogGpuTimeBegin()); 833 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 834 if (transA) { 835 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 836 transA = false; 837 } 838 839 if (transB) { 840 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 841 transB = false; 842 } 843 844 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 845 846 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 847 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 848 calling new Mat_SeqAIJKokkos(). 849 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 850 */ 851 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 852 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 853 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 854 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 855 PetscCall(PetscLogGpuTimeEnd()); 856 857 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 858 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 859 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 860 PetscFunctionReturn(0); 861 } 862 863 /* handles sparse matrix matrix ops */ 864 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 865 { 866 Mat_Product *product = mat->product; 867 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 868 869 PetscFunctionBegin; 870 MatCheckProduct(mat, 1); 871 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 872 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 873 if (Biskok && Ciskok) { 874 switch (product->type) { 875 case MATPRODUCT_AB: 876 case MATPRODUCT_AtB: 877 case MATPRODUCT_ABt: 878 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 879 break; 880 case MATPRODUCT_PtAP: 881 case MATPRODUCT_RARt: 882 case MATPRODUCT_ABC: 883 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 884 break; 885 default: 886 break; 887 } 888 } else { /* fallback for AIJ */ 889 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 890 } 891 PetscFunctionReturn(0); 892 } 893 894 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 895 { 896 Mat_SeqAIJKokkos *aijkok; 897 898 PetscFunctionBegin; 899 PetscCall(PetscLogGpuTimeBegin()); 900 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 901 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 902 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 903 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 904 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 905 PetscCall(PetscLogGpuTimeEnd()); 906 PetscFunctionReturn(0); 907 } 908 909 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 915 if (aijkok) { /* Only zero the device if data is already there */ 916 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 917 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 918 } else { /* Might be preallocated but not assembled */ 919 PetscCall(MatZeroEntries_SeqAIJ(A)); 920 } 921 PetscFunctionReturn(0); 922 } 923 924 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 925 { 926 Mat_SeqAIJ *aijseq; 927 Mat_SeqAIJKokkos *aijkok; 928 PetscInt n; 929 PetscScalarKokkosView xv; 930 931 PetscFunctionBegin; 932 PetscCall(VecGetLocalSize(x, &n)); 933 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 934 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 935 936 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 937 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 938 939 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 940 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 941 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 942 aijkok->SetDiagonal(aijseq->diag); 943 } 944 945 const auto &Aa = aijkok->a_dual.view_device(); 946 const auto &Ai = aijkok->i_dual.view_device(); 947 const auto &Adiag = aijkok->diag_dual.view_device(); 948 949 PetscCall(VecGetKokkosViewWrite(x, &xv)); 950 Kokkos::parallel_for( 951 n, KOKKOS_LAMBDA(const PetscInt i) { 952 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 953 else xv(i) = 0; 954 }); 955 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 956 PetscFunctionReturn(0); 957 } 958 959 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 960 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 961 { 962 Mat_SeqAIJKokkos *aijkok; 963 964 PetscFunctionBegin; 965 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 966 PetscValidPointer(kv, 2); 967 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 968 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 969 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 970 *kv = aijkok->a_dual.view_device(); 971 PetscFunctionReturn(0); 972 } 973 974 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 975 { 976 PetscFunctionBegin; 977 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 978 PetscValidPointer(kv, 2); 979 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 980 PetscFunctionReturn(0); 981 } 982 983 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 984 { 985 Mat_SeqAIJKokkos *aijkok; 986 987 PetscFunctionBegin; 988 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 989 PetscValidPointer(kv, 2); 990 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 991 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 992 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 993 *kv = aijkok->a_dual.view_device(); 994 PetscFunctionReturn(0); 995 } 996 997 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 998 { 999 PetscFunctionBegin; 1000 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1001 PetscValidPointer(kv, 2); 1002 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1003 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1004 PetscFunctionReturn(0); 1005 } 1006 1007 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1008 { 1009 Mat_SeqAIJKokkos *aijkok; 1010 1011 PetscFunctionBegin; 1012 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1013 PetscValidPointer(kv, 2); 1014 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1015 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1016 *kv = aijkok->a_dual.view_device(); 1017 PetscFunctionReturn(0); 1018 } 1019 1020 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1021 { 1022 PetscFunctionBegin; 1023 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1024 PetscValidPointer(kv, 2); 1025 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1026 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1027 PetscFunctionReturn(0); 1028 } 1029 1030 /* Computes Y += alpha X */ 1031 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1032 { 1033 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1034 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1035 ConstMatScalarKokkosView Xa; 1036 MatScalarKokkosView Ya; 1037 1038 PetscFunctionBegin; 1039 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1040 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1041 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1042 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1043 PetscCall(PetscLogGpuTimeBegin()); 1044 1045 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1046 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1047 PetscBool e; 1048 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1049 if (e) { 1050 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1051 if (e) pattern = SAME_NONZERO_PATTERN; 1052 } 1053 } 1054 1055 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1056 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1057 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1058 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1059 */ 1060 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1061 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1062 Xa = xkok->a_dual.view_device(); 1063 Ya = ykok->a_dual.view_device(); 1064 1065 if (pattern == SAME_NONZERO_PATTERN) { 1066 KokkosBlas::axpy(alpha, Xa, Ya); 1067 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1068 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1069 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1070 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1071 1072 Kokkos::parallel_for( 1073 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1074 PetscInt i = t.league_rank(); /* row i */ 1075 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1076 PetscInt p, q = Yi(i); 1077 for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1078 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1079 if (Xj(p) == Yj(q)) { /* Found it */ 1080 Ya(q) += alpha * Xa(p); 1081 q++; 1082 } else { 1083 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1084 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1085 */ 1086 if (Yi(i) != Yi(i + 1)) 1087 Ya(Yi(i)) = 1088 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 1089 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1090 #else 1091 Kokkos::Experimental::nan("1"); 1092 #endif 1093 } 1094 } 1095 }); 1096 }); 1097 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1098 } else { /* different nonzero patterns */ 1099 Mat Z; 1100 KokkosCsrMatrix zcsr; 1101 KernelHandle kh; 1102 kh.create_spadd_handle(false); 1103 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1104 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1105 zkok = new Mat_SeqAIJKokkos(zcsr); 1106 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1107 PetscCall(MatHeaderReplace(Y, &Z)); 1108 kh.destroy_spadd_handle(); 1109 } 1110 PetscCall(PetscLogGpuTimeEnd()); 1111 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 1112 PetscFunctionReturn(0); 1113 } 1114 1115 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1116 { 1117 Mat_SeqAIJKokkos *akok; 1118 Mat_SeqAIJ *aseq; 1119 1120 PetscFunctionBegin; 1121 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1122 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1123 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1124 delete akok; 1125 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1126 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1127 akok->SetUpCOO(aseq); 1128 PetscFunctionReturn(0); 1129 } 1130 1131 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1132 { 1133 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1134 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1135 PetscCount Annz = aseq->nz; 1136 const PetscCountKokkosView &jmap = akok->jmap_d; 1137 const PetscCountKokkosView &perm = akok->perm_d; 1138 MatScalarKokkosView Aa; 1139 ConstMatScalarKokkosView kv; 1140 PetscMemType memtype; 1141 1142 PetscFunctionBegin; 1143 PetscCall(PetscGetMemType(v, &memtype)); 1144 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1145 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1146 } else { 1147 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1148 } 1149 1150 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1151 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1152 1153 Kokkos::parallel_for( 1154 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1155 PetscScalar sum = 0.0; 1156 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1157 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1158 }); 1159 1160 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1161 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1162 PetscFunctionReturn(0); 1163 } 1164 1165 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1166 { 1167 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1168 MatScalarKokkosView Aa; 1169 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1170 PetscInt m = A->rmap->n; 1171 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1172 1173 PetscFunctionBegin; 1174 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1175 Kokkos::parallel_for( 1176 m, KOKKOS_LAMBDA(const PetscInt i) { 1177 PetscScalar tmp; 1178 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1179 tmp = Aa(Ai(i)); 1180 Aa(Ai(i)) = Aa(Adiag(i)); 1181 Aa(Adiag(i)) = tmp; 1182 } 1183 }); 1184 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1189 { 1190 PetscFunctionBegin; 1191 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1192 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1193 B->offloadmask = PETSC_OFFLOAD_CPU; 1194 PetscFunctionReturn(0); 1195 } 1196 1197 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1198 { 1199 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1200 1201 PetscFunctionBegin; 1202 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1203 A->boundtocpu = PETSC_FALSE; 1204 1205 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1206 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1207 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1208 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1209 A->ops->scale = MatScale_SeqAIJKokkos; 1210 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1211 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1212 A->ops->mult = MatMult_SeqAIJKokkos; 1213 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1214 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1215 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1216 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1217 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1218 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1219 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1220 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1221 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1222 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1223 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1224 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1225 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1226 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1227 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1228 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1229 1230 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1231 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1232 PetscFunctionReturn(0); 1233 } 1234 1235 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1236 { 1237 Mat_SeqAIJ *aseq; 1238 PetscInt i, m, n; 1239 1240 PetscFunctionBegin; 1241 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1242 1243 m = akok->nrows(); 1244 n = akok->ncols(); 1245 PetscCall(MatSetSizes(A, m, n, m, n)); 1246 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1247 1248 /* Set up data structures of A as a MATSEQAIJ */ 1249 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1250 aseq = (Mat_SeqAIJ *)(A)->data; 1251 1252 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1253 akok->j_dual.sync_host(); 1254 1255 aseq->i = akok->i_host_data(); 1256 aseq->j = akok->j_host_data(); 1257 aseq->a = akok->a_host_data(); 1258 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1259 aseq->singlemalloc = PETSC_FALSE; 1260 aseq->free_a = PETSC_FALSE; 1261 aseq->free_ij = PETSC_FALSE; 1262 aseq->nz = akok->nnz(); 1263 aseq->maxnz = aseq->nz; 1264 1265 PetscCall(PetscMalloc1(m, &aseq->imax)); 1266 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1267 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1268 1269 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1270 akok->nonzerostate = A->nonzerostate; 1271 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1272 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1273 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1274 PetscFunctionReturn(0); 1275 } 1276 1277 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1278 1279 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1280 */ 1281 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1282 { 1283 PetscFunctionBegin; 1284 PetscCall(MatCreate(comm, A)); 1285 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1286 PetscFunctionReturn(0); 1287 } 1288 1289 /* --------------------------------------------------------------------------------*/ 1290 /*@C 1291 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1292 (the default parallel PETSc format). This matrix will ultimately be handled by 1293 Kokkos for calculations. For good matrix 1294 assembly performance the user should preallocate the matrix storage by setting 1295 the parameter nz (or the array nnz). By setting these parameters accurately, 1296 performance during matrix assembly can be increased by more than a factor of 50. 1297 1298 Collective 1299 1300 Input Parameters: 1301 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1302 . m - number of rows 1303 . n - number of columns 1304 . nz - number of nonzeros per row (same for all rows) 1305 - nnz - array containing the number of nonzeros in the various rows 1306 (possibly different for each row) or NULL 1307 1308 Output Parameter: 1309 . A - the matrix 1310 1311 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1312 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1313 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1314 1315 Notes: 1316 If nnz is given then nz is ignored 1317 1318 The AIJ format, also called 1319 compressed row storage, is fully compatible with standard Fortran 77 1320 storage. That is, the stored row and column indices can begin at 1321 either one (as in Fortran) or zero. See the users' manual for details. 1322 1323 Specify the preallocated storage with either nz or nnz (not both). 1324 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 1325 allocation. For large problems you MUST preallocate memory or you 1326 will get TERRIBLE performance, see the users' manual chapter on matrices. 1327 1328 By default, this format uses inodes (identical nodes) when possible, to 1329 improve numerical efficiency of matrix-vector products and solves. We 1330 search for consecutive rows with the same nonzero structure, thereby 1331 reusing matrix information to achieve increased efficiency. 1332 1333 Level: intermediate 1334 1335 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1336 @*/ 1337 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1338 { 1339 PetscFunctionBegin; 1340 PetscCall(PetscKokkosInitializeCheck()); 1341 PetscCall(MatCreate(comm, A)); 1342 PetscCall(MatSetSizes(*A, m, n, m, n)); 1343 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1344 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 typedef Kokkos::TeamPolicy<>::member_type team_member; 1349 // 1350 // This factorization exploits block diagonal matrices with "Nf" (not used). 1351 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1352 // 1353 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1354 { 1355 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1356 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1357 IS isrow = b->row, isicol = b->icol; 1358 const PetscInt *r_h, *ic_h; 1359 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1360 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1361 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1362 PetscBool row_identity, col_identity; 1363 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1364 1365 PetscFunctionBegin; 1366 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1367 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1368 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1369 PetscCall(ISGetIndices(isrow, &r_h)); 1370 PetscCall(ISGetIndices(isicol, &ic_h)); 1371 PetscCall(ISGetSize(isicol, &nc)); 1372 PetscCall(PetscLogGpuTimeBegin()); 1373 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1374 { 1375 #define KOKKOS_SHARED_LEVEL 1 1376 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1377 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1378 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1379 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1380 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1381 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1382 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1383 size_t flops_h = 0.0; 1384 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1385 Kokkos::View<size_t> d_flops_k("flops"); 1386 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1387 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1388 Kokkos::deep_copy(d_flops_k, h_flops_k); 1389 Kokkos::deep_copy(d_r_k, h_r_k); 1390 Kokkos::deep_copy(d_ic_k, h_ic_k); 1391 // Fill A --> fact 1392 Kokkos::parallel_for( 1393 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1394 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1395 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1396 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1397 // zero rows of B 1398 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1399 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1400 PetscScalar *baL = ba_d + bi_d[rowb]; 1401 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1402 /* zero (unfactored row) */ 1403 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1404 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1405 }); 1406 // copy A into B 1407 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1408 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1409 const PetscScalar *av = aa_d + ai_d[rowa]; 1410 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1411 /* load in initial (unfactored row) */ 1412 for (int j = 0; j < nza; j++) { 1413 PetscInt colb = ic[ajtmp[j]]; 1414 PetscScalar vala = av[j]; 1415 if (colb == rowb) { 1416 *(ba_d + bdiag_d[rowb]) = vala; 1417 } else { 1418 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1419 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1420 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1421 for (int j = 0; j < nz; j++) { 1422 if (pbj[j] == colb) { 1423 pba[j] = vala; 1424 set++; 1425 break; 1426 } 1427 } 1428 #if !defined(PETSC_HAVE_SYCL) 1429 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1430 #endif 1431 } 1432 } 1433 }); 1434 }); 1435 Kokkos::fence(); 1436 1437 Kokkos::parallel_for( 1438 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1439 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1440 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1441 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1442 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1443 const PetscInt start = field * nloc, end = start + nloc; 1444 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1445 // A22 panel update for each row A(1,:) and col A(:,1) 1446 for (int ii = start; ii < end - 1; ii++) { 1447 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1448 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1449 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1450 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1451 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1452 PetscInt kIdx = j * Ni + field_block_idx; 1453 if (kIdx >= nzUi) /* void */ 1454 ; 1455 else { 1456 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1457 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1458 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1459 size_t st_idx; 1460 // find and do L(k,i) = A(:k,i) / A(i,i) 1461 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1462 // get column, there has got to be a better way 1463 Kokkos::parallel_reduce( 1464 Kokkos::ThreadVectorRange(team, nzL), 1465 [&](const int &j, size_t &idx) { 1466 if (pjL[j] == ii) { 1467 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1468 idx = j; // output 1469 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1470 } 1471 }, 1472 st_idx); 1473 Kokkos::single(Kokkos::PerThread(team), [=]() { 1474 colkIdx() = st_idx; 1475 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1476 }); 1477 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1478 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1479 #endif 1480 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1481 // U(i+1,:end) 1482 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1483 PetscScalar Uij = baUi[uiIdx]; 1484 PetscInt col = bjUi[uiIdx]; 1485 if (col == myk) { 1486 // A_kk = A_kk - L_ki * U_ij(k) 1487 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1488 *Akkv = *Akkv - L_ki() * Uij; // UiK 1489 } else { 1490 PetscScalar *start, *end, *pAkjv = NULL; 1491 PetscInt high, low; 1492 const PetscInt *startj; 1493 if (col < myk) { // L 1494 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1495 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1496 start = pLki + 1; // start at pLki+1, A22(myk,1) 1497 startj = bj_d + bi_d[myk] + idx; 1498 end = ba_d + bi_d[myk + 1]; 1499 } else { 1500 PetscInt idx = bdiag_d[myk + 1] + 1; 1501 start = ba_d + idx; 1502 startj = bj_d + idx; 1503 end = ba_d + bdiag_d[myk]; 1504 } 1505 // search for 'col', use bisection search - TODO 1506 low = 0; 1507 high = (PetscInt)(end - start); 1508 while (high - low > 5) { 1509 int t = (low + high) / 2; 1510 if (startj[t] > col) high = t; 1511 else low = t; 1512 } 1513 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1514 if (startj[pAkjv - start] == col) break; 1515 } 1516 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1517 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1518 #endif 1519 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1520 } 1521 }); 1522 } 1523 }); 1524 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1525 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1526 } /* endof for (i=0; i<n; i++) { */ 1527 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1528 Kokkos::atomic_add(&d_flops_k(), flops()); 1529 flops() = 0; 1530 }); 1531 }); 1532 Kokkos::fence(); 1533 Kokkos::deep_copy(h_flops_k, d_flops_k); 1534 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1535 Kokkos::parallel_for( 1536 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1537 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1538 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1539 /* Invert diagonal for simpler triangular solves */ 1540 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1541 int i = start + outer_index * Ni + lg_rank % Ni; 1542 if (i < end) { 1543 PetscScalar *pv = ba_d + bdiag_d[i]; 1544 *pv = 1.0 / (*pv); 1545 } 1546 }); 1547 }); 1548 } 1549 PetscCall(PetscLogGpuTimeEnd()); 1550 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1551 PetscCall(ISRestoreIndices(isrow, &r_h)); 1552 1553 PetscCall(ISIdentity(isrow, &row_identity)); 1554 PetscCall(ISIdentity(isicol, &col_identity)); 1555 if (b->inode.size) { 1556 B->ops->solve = MatSolve_SeqAIJ_Inode; 1557 } else if (row_identity && col_identity) { 1558 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1559 } else { 1560 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1561 } 1562 B->offloadmask = PETSC_OFFLOAD_GPU; 1563 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1564 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1565 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1566 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1567 B->ops->matsolve = MatMatSolve_SeqAIJ; 1568 B->assembled = PETSC_TRUE; 1569 B->preallocated = PETSC_TRUE; 1570 1571 PetscFunctionReturn(0); 1572 } 1573 1574 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1575 { 1576 PetscFunctionBegin; 1577 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1578 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1579 PetscFunctionReturn(0); 1580 } 1581 1582 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1583 { 1584 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1585 1586 PetscFunctionBegin; 1587 if (!factors->sptrsv_symbolic_completed) { 1588 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1589 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1590 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1591 } 1592 PetscFunctionReturn(0); 1593 } 1594 1595 /* Check if we need to update factors etc for transpose solve */ 1596 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1597 { 1598 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1599 MatColIdxType n = A->rmap->n; 1600 1601 PetscFunctionBegin; 1602 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1603 /* Update L^T and do sptrsv symbolic */ 1604 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1605 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1606 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1607 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1608 1609 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1610 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1611 1612 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1613 We have to sort the indices, until KK provides finer control options. 1614 */ 1615 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1616 1617 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1618 1619 /* Update U^T and do sptrsv symbolic */ 1620 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1621 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1622 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1623 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1624 1625 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1626 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1627 1628 /* Sort indices. See comments above */ 1629 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1630 1631 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1632 factors->transpose_updated = PETSC_TRUE; 1633 } 1634 PetscFunctionReturn(0); 1635 } 1636 1637 /* Solve Ax = b, with A = LU */ 1638 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1639 { 1640 ConstPetscScalarKokkosView bv; 1641 PetscScalarKokkosView xv; 1642 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1643 1644 PetscFunctionBegin; 1645 PetscCall(PetscLogGpuTimeBegin()); 1646 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1647 PetscCall(VecGetKokkosView(b, &bv)); 1648 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1649 /* Solve L tmpv = b */ 1650 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1651 /* Solve Ux = tmpv */ 1652 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1653 PetscCall(VecRestoreKokkosView(b, &bv)); 1654 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1655 PetscCall(PetscLogGpuTimeEnd()); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 /* Solve A^T x = b, where A^T = U^T L^T */ 1660 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1661 { 1662 ConstPetscScalarKokkosView bv; 1663 PetscScalarKokkosView xv; 1664 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1665 1666 PetscFunctionBegin; 1667 PetscCall(PetscLogGpuTimeBegin()); 1668 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1669 PetscCall(VecGetKokkosView(b, &bv)); 1670 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1671 /* Solve U^T tmpv = b */ 1672 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1673 1674 /* Solve L^T x = tmpv */ 1675 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1676 PetscCall(VecRestoreKokkosView(b, &bv)); 1677 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1678 PetscCall(PetscLogGpuTimeEnd()); 1679 PetscFunctionReturn(0); 1680 } 1681 1682 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1683 { 1684 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1685 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1686 PetscInt fill_lev = info->levels; 1687 1688 PetscFunctionBegin; 1689 PetscCall(PetscLogGpuTimeBegin()); 1690 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1691 1692 auto a_d = aijkok->a_dual.view_device(); 1693 auto i_d = aijkok->i_dual.view_device(); 1694 auto j_d = aijkok->j_dual.view_device(); 1695 1696 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1697 1698 B->assembled = PETSC_TRUE; 1699 B->preallocated = PETSC_TRUE; 1700 B->ops->solve = MatSolve_SeqAIJKokkos; 1701 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1702 B->ops->matsolve = NULL; 1703 B->ops->matsolvetranspose = NULL; 1704 B->offloadmask = PETSC_OFFLOAD_GPU; 1705 1706 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1707 factors->transpose_updated = PETSC_FALSE; 1708 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1709 /* TODO: log flops, but how to know that? */ 1710 PetscCall(PetscLogGpuTimeEnd()); 1711 PetscFunctionReturn(0); 1712 } 1713 1714 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1715 { 1716 Mat_SeqAIJKokkos *aijkok; 1717 Mat_SeqAIJ *b; 1718 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1719 PetscInt fill_lev = info->levels; 1720 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1721 PetscInt n = A->rmap->n; 1722 1723 PetscFunctionBegin; 1724 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1725 /* Rebuild factors */ 1726 if (factors) { 1727 factors->Destroy(); 1728 } /* Destroy the old if it exists */ 1729 else { 1730 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1731 } 1732 1733 /* Create a spiluk handle and then do symbolic factorization */ 1734 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1735 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1736 1737 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1738 1739 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1740 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1741 Kokkos::realloc(factors->iU_d, n + 1); 1742 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1743 1744 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1745 auto i_d = aijkok->i_dual.view_device(); 1746 auto j_d = aijkok->j_dual.view_device(); 1747 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1748 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1749 1750 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1751 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1752 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1753 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1754 1755 /* TODO: add options to select sptrsv algorithms */ 1756 /* Create sptrsv handles for L, U and their transpose */ 1757 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1758 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1759 #else 1760 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1761 #endif 1762 1763 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1764 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1765 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1766 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1767 1768 /* Fill fields of the factor matrix B */ 1769 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1770 b = (Mat_SeqAIJ *)B->data; 1771 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1772 B->info.fill_ratio_given = info->fill; 1773 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1774 1775 B->offloadmask = PETSC_OFFLOAD_GPU; 1776 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1777 PetscFunctionReturn(0); 1778 } 1779 1780 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1781 { 1782 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1783 const PetscInt nrows = A->rmap->n; 1784 1785 PetscFunctionBegin; 1786 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1787 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1788 // move B data into Kokkos 1789 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1790 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1791 { 1792 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1793 if (!baijkok->diag_d.extent(0)) { 1794 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1795 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1796 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1797 } 1798 } 1799 PetscFunctionReturn(0); 1800 } 1801 1802 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1803 { 1804 PetscFunctionBegin; 1805 *type = MATSOLVERKOKKOS; 1806 PetscFunctionReturn(0); 1807 } 1808 1809 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1810 { 1811 PetscFunctionBegin; 1812 *type = MATSOLVERKOKKOSDEVICE; 1813 PetscFunctionReturn(0); 1814 } 1815 1816 /*MC 1817 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1818 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1819 1820 Level: beginner 1821 1822 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1823 M*/ 1824 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1825 { 1826 PetscInt n = A->rmap->n; 1827 1828 PetscFunctionBegin; 1829 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1830 PetscCall(MatSetSizes(*B, n, n, n, n)); 1831 (*B)->factortype = ftype; 1832 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1833 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1834 1835 if (ftype == MAT_FACTOR_LU) { 1836 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1837 (*B)->canuseordering = PETSC_TRUE; 1838 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1839 } else if (ftype == MAT_FACTOR_ILU) { 1840 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1841 (*B)->canuseordering = PETSC_FALSE; 1842 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1843 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1844 1845 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1846 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1847 PetscFunctionReturn(0); 1848 } 1849 1850 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1851 { 1852 PetscInt n = A->rmap->n; 1853 1854 PetscFunctionBegin; 1855 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1856 PetscCall(MatSetSizes(*B, n, n, n, n)); 1857 (*B)->factortype = ftype; 1858 (*B)->canuseordering = PETSC_TRUE; 1859 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1860 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1861 1862 if (ftype == MAT_FACTOR_LU) { 1863 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1864 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1865 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1866 1867 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1868 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1869 PetscFunctionReturn(0); 1870 } 1871 1872 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1873 { 1874 PetscFunctionBegin; 1875 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1876 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1877 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1878 PetscFunctionReturn(0); 1879 } 1880 1881 /* Utility to print out a KokkosCsrMatrix for debugging */ 1882 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1883 { 1884 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1885 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1886 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1887 const PetscInt *i = iv.data(); 1888 const PetscInt *j = jv.data(); 1889 const PetscScalar *a = av.data(); 1890 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1891 1892 PetscFunctionBegin; 1893 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1894 for (PetscInt k = 0; k < m; k++) { 1895 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1896 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1897 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1898 } 1899 PetscFunctionReturn(0); 1900 } 1901