1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 /* Generate the transpose on device and cache it internally */ 228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 234 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 235 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 236 PetscCallCXX(aijkok->a_dual.sync_device()); 237 PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 238 PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 239 aijkok->transpose_updated = PETSC_TRUE; 240 } 241 *csrmatT = &aijkok->csrmatT; 242 PetscFunctionReturn(PETSC_SUCCESS); 243 } 244 245 /* Generate the Hermitian on device and cache it internally */ 246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 247 { 248 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 249 250 PetscFunctionBegin; 251 PetscCall(PetscLogGpuTimeBegin()); 252 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 253 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 254 PetscCallCXX(aijkok->a_dual.sync_device()); 255 PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 256 PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 257 #if defined(PETSC_USE_COMPLEX) 258 const auto &a = aijkok->csrmatH.values; 259 Kokkos::parallel_for( 260 a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 261 #endif 262 aijkok->hermitian_updated = PETSC_TRUE; 263 } 264 *csrmatH = &aijkok->csrmatH; 265 PetscCall(PetscLogGpuTimeEnd()); 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 /* y = A x */ 270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 271 { 272 Mat_SeqAIJKokkos *aijkok; 273 ConstPetscScalarKokkosView xv; 274 PetscScalarKokkosView yv; 275 276 PetscFunctionBegin; 277 PetscCall(PetscLogGpuTimeBegin()); 278 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 279 PetscCall(VecGetKokkosView(xx, &xv)); 280 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 281 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 282 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 283 PetscCall(VecRestoreKokkosView(xx, &xv)); 284 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 285 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 286 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 287 PetscCall(PetscLogGpuTimeEnd()); 288 PetscFunctionReturn(PETSC_SUCCESS); 289 } 290 291 /* y = A^T x */ 292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 293 { 294 Mat_SeqAIJKokkos *aijkok; 295 const char *mode; 296 ConstPetscScalarKokkosView xv; 297 PetscScalarKokkosView yv; 298 KokkosCsrMatrix *csrmat; 299 300 PetscFunctionBegin; 301 PetscCall(PetscLogGpuTimeBegin()); 302 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 303 PetscCall(VecGetKokkosView(xx, &xv)); 304 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 305 if (A->form_explicit_transpose) { 306 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 307 mode = "N"; 308 } else { 309 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 csrmat = &aijkok->csrmat; 311 mode = "T"; 312 } 313 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 314 PetscCall(VecRestoreKokkosView(xx, &xv)); 315 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 316 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 317 PetscCall(PetscLogGpuTimeEnd()); 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 /* y = A^H x */ 322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 323 { 324 Mat_SeqAIJKokkos *aijkok; 325 const char *mode; 326 ConstPetscScalarKokkosView xv; 327 PetscScalarKokkosView yv; 328 KokkosCsrMatrix *csrmat; 329 330 PetscFunctionBegin; 331 PetscCall(PetscLogGpuTimeBegin()); 332 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 333 PetscCall(VecGetKokkosView(xx, &xv)); 334 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 335 if (A->form_explicit_transpose) { 336 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 337 mode = "N"; 338 } else { 339 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 340 csrmat = &aijkok->csrmat; 341 mode = "C"; 342 } 343 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 344 PetscCall(VecRestoreKokkosView(xx, &xv)); 345 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 346 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 347 PetscCall(PetscLogGpuTimeEnd()); 348 PetscFunctionReturn(PETSC_SUCCESS); 349 } 350 351 /* z = A x + y */ 352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 353 { 354 Mat_SeqAIJKokkos *aijkok; 355 ConstPetscScalarKokkosView xv, yv; 356 PetscScalarKokkosView zv; 357 358 PetscFunctionBegin; 359 PetscCall(PetscLogGpuTimeBegin()); 360 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 361 PetscCall(VecGetKokkosView(xx, &xv)); 362 PetscCall(VecGetKokkosView(yy, &yv)); 363 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 364 if (zz != yy) Kokkos::deep_copy(zv, yv); 365 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 366 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 367 PetscCall(VecRestoreKokkosView(xx, &xv)); 368 PetscCall(VecRestoreKokkosView(yy, &yv)); 369 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 370 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 371 PetscCall(PetscLogGpuTimeEnd()); 372 PetscFunctionReturn(PETSC_SUCCESS); 373 } 374 375 /* z = A^T x + y */ 376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 377 { 378 Mat_SeqAIJKokkos *aijkok; 379 const char *mode; 380 ConstPetscScalarKokkosView xv, yv; 381 PetscScalarKokkosView zv; 382 KokkosCsrMatrix *csrmat; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosView(yy, &yv)); 389 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 390 if (zz != yy) Kokkos::deep_copy(zv, yv); 391 if (A->form_explicit_transpose) { 392 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 393 mode = "N"; 394 } else { 395 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 396 csrmat = &aijkok->csrmat; 397 mode = "T"; 398 } 399 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 400 PetscCall(VecRestoreKokkosView(xx, &xv)); 401 PetscCall(VecRestoreKokkosView(yy, &yv)); 402 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 403 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 404 PetscCall(PetscLogGpuTimeEnd()); 405 PetscFunctionReturn(PETSC_SUCCESS); 406 } 407 408 /* z = A^H x + y */ 409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 410 { 411 Mat_SeqAIJKokkos *aijkok; 412 const char *mode; 413 ConstPetscScalarKokkosView xv, yv; 414 PetscScalarKokkosView zv; 415 KokkosCsrMatrix *csrmat; 416 417 PetscFunctionBegin; 418 PetscCall(PetscLogGpuTimeBegin()); 419 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 420 PetscCall(VecGetKokkosView(xx, &xv)); 421 PetscCall(VecGetKokkosView(yy, &yv)); 422 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 423 if (zz != yy) Kokkos::deep_copy(zv, yv); 424 if (A->form_explicit_transpose) { 425 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 426 mode = "N"; 427 } else { 428 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 429 csrmat = &aijkok->csrmat; 430 mode = "C"; 431 } 432 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 433 PetscCall(VecRestoreKokkosView(xx, &xv)); 434 PetscCall(VecRestoreKokkosView(yy, &yv)); 435 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 436 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 437 PetscCall(PetscLogGpuTimeEnd()); 438 PetscFunctionReturn(PETSC_SUCCESS); 439 } 440 441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 442 { 443 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 444 445 PetscFunctionBegin; 446 switch (op) { 447 case MAT_FORM_EXPLICIT_TRANSPOSE: 448 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 449 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 450 A->form_explicit_transpose = flg; 451 break; 452 default: 453 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 454 break; 455 } 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* Depending on reuse, either build a new mat, or use the existing mat */ 460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 461 { 462 Mat_SeqAIJ *aseq; 463 464 PetscFunctionBegin; 465 PetscCall(PetscKokkosInitializeCheck()); 466 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 467 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 468 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 469 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 470 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 471 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 472 PetscCall(PetscFree(A->defaultvectype)); 473 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 474 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 475 PetscCall(MatSetOps_SeqAIJKokkos(A)); 476 aseq = static_cast<Mat_SeqAIJ *>(A->data); 477 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 478 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 479 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 480 } 481 } 482 PetscFunctionReturn(PETSC_SUCCESS); 483 } 484 485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 486 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 487 */ 488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 489 { 490 Mat_SeqAIJ *bseq; 491 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 492 Mat mat; 493 494 PetscFunctionBegin; 495 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 496 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 497 mat = *B; 498 if (A->assembled) { 499 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 500 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 501 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 502 /* Now copy values to B if needed */ 503 if (dupOption == MAT_COPY_VALUES) { 504 if (akok->a_dual.need_sync_device()) { 505 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 506 bkok->a_dual.modify_host(); 507 } else { /* If device has the latest data, we only copy data on device */ 508 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 509 bkok->a_dual.modify_device(); 510 } 511 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 512 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 513 bkok->a_dual.modify_host(); 514 } 515 mat->spptr = bkok; 516 } 517 518 PetscCall(PetscFree(mat->defaultvectype)); 519 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 520 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 521 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 526 { 527 Mat At; 528 KokkosCsrMatrix *internT; 529 Mat_SeqAIJKokkos *atkok, *bkok; 530 531 PetscFunctionBegin; 532 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 533 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 534 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 535 /* Deep copy internT, as we want to isolate the internal transpose */ 536 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 537 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 538 if (reuse == MAT_INITIAL_MATRIX) *B = At; 539 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 540 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 541 if ((*B)->assembled) { 542 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 543 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 544 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 545 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 546 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 547 MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 548 MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 549 PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 550 PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 551 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 552 } 553 PetscFunctionReturn(PETSC_SUCCESS); 554 } 555 556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 557 { 558 Mat_SeqAIJKokkos *aijkok; 559 560 PetscFunctionBegin; 561 if (A->factortype == MAT_FACTOR_NONE) { 562 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 563 delete aijkok; 564 } else { 565 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 566 } 567 A->spptr = NULL; 568 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 569 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 570 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 571 PetscCall(MatDestroy_SeqAIJ(A)); 572 PetscFunctionReturn(PETSC_SUCCESS); 573 } 574 575 /*MC 576 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 577 578 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 579 580 Options Database Key: 581 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 582 583 Level: beginner 584 585 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 586 M*/ 587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 588 { 589 PetscFunctionBegin; 590 PetscCall(PetscKokkosInitializeCheck()); 591 PetscCall(MatCreate_SeqAIJ(A)); 592 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 598 { 599 Mat_SeqAIJ *a, *b; 600 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 601 MatScalarKokkosView aa, ba, ca; 602 MatRowMapKokkosView ai, bi, ci; 603 MatColIdxKokkosView aj, bj, cj; 604 PetscInt m, n, nnz, aN; 605 606 PetscFunctionBegin; 607 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 608 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 609 PetscValidPointer(C, 4); 610 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 611 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 612 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 613 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 614 615 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 616 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 617 a = static_cast<Mat_SeqAIJ *>(A->data); 618 b = static_cast<Mat_SeqAIJ *>(B->data); 619 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 620 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 621 aa = akok->a_dual.view_device(); 622 ai = akok->i_dual.view_device(); 623 ba = bkok->a_dual.view_device(); 624 bi = bkok->i_dual.view_device(); 625 m = A->rmap->n; /* M, N and nnz of C */ 626 n = A->cmap->n + B->cmap->n; 627 nnz = a->nz + b->nz; 628 aN = A->cmap->n; /* N of A */ 629 if (reuse == MAT_INITIAL_MATRIX) { 630 aj = akok->j_dual.view_device(); 631 bj = bkok->j_dual.view_device(); 632 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 633 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 634 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 635 ca = ca_dual.view_device(); 636 ci = ci_dual.view_device(); 637 cj = cj_dual.view_device(); 638 639 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 640 Kokkos::parallel_for( 641 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 642 PetscInt i = t.league_rank(); /* row i */ 643 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 644 645 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 646 ci(i) = coffset; 647 if (i == m - 1) ci(m) = ai(m) + bi(m); 648 }); 649 650 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 651 if (k < alen) { 652 ca(coffset + k) = aa(ai(i) + k); 653 cj(coffset + k) = aj(ai(i) + k); 654 } else { 655 ca(coffset + k) = ba(bi(i) + k - alen); 656 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 657 } 658 }); 659 }); 660 ca_dual.modify_device(); 661 ci_dual.modify_device(); 662 cj_dual.modify_device(); 663 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 664 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 665 } else if (reuse == MAT_REUSE_MATRIX) { 666 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 667 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 668 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 669 ca = ckok->a_dual.view_device(); 670 ci = ckok->i_dual.view_device(); 671 672 Kokkos::parallel_for( 673 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 674 PetscInt i = t.league_rank(); /* row i */ 675 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 676 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 677 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 678 else ca(ci(i) + k) = ba(bi(i) + k - alen); 679 }); 680 }); 681 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 682 } 683 PetscFunctionReturn(PETSC_SUCCESS); 684 } 685 686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 687 { 688 PetscFunctionBegin; 689 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 690 PetscFunctionReturn(PETSC_SUCCESS); 691 } 692 693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 694 { 695 Mat_Product *product = C->product; 696 Mat A, B; 697 bool transA, transB; /* use bool, since KK needs this type */ 698 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 699 Mat_SeqAIJ *c; 700 MatProductData_SeqAIJKokkos *pdata; 701 KokkosCsrMatrix *csrmatA, *csrmatB; 702 703 PetscFunctionBegin; 704 MatCheckProduct(C, 1); 705 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 706 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 707 708 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 709 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 switch (product->type) { 714 case MATPRODUCT_AB: 715 transA = false; 716 transB = false; 717 break; 718 case MATPRODUCT_AtB: 719 transA = true; 720 transB = false; 721 break; 722 case MATPRODUCT_ABt: 723 transA = false; 724 transB = true; 725 break; 726 default: 727 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 728 } 729 730 A = product->A; 731 B = product->B; 732 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 733 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 734 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 735 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 736 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 737 738 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 739 740 csrmatA = &akok->csrmat; 741 csrmatB = &bkok->csrmat; 742 743 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 744 if (transA) { 745 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 746 transA = false; 747 } 748 749 if (transB) { 750 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 751 transB = false; 752 } 753 PetscCall(PetscLogGpuTimeBegin()); 754 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 755 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 756 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 757 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 758 #endif 759 760 PetscCall(PetscLogGpuTimeEnd()); 761 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 762 /* shorter version of MatAssemblyEnd_SeqAIJ */ 763 c = (Mat_SeqAIJ *)C->data; 764 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 765 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 766 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 767 c->reallocs = 0; 768 C->info.mallocs = 0; 769 C->info.nz_unneeded = 0; 770 C->assembled = C->was_assembled = PETSC_TRUE; 771 C->num_ass++; 772 PetscFunctionReturn(PETSC_SUCCESS); 773 } 774 775 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 776 { 777 Mat_Product *product = C->product; 778 MatProductType ptype; 779 Mat A, B; 780 bool transA, transB; 781 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 782 MatProductData_SeqAIJKokkos *pdata; 783 MPI_Comm comm; 784 KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 785 786 PetscFunctionBegin; 787 MatCheckProduct(C, 1); 788 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 789 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 790 A = product->A; 791 B = product->B; 792 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 793 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 794 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 795 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 796 csrmatA = &akok->csrmat; 797 csrmatB = &bkok->csrmat; 798 799 ptype = product->type; 800 switch (ptype) { 801 case MATPRODUCT_AB: 802 transA = false; 803 transB = false; 804 break; 805 case MATPRODUCT_AtB: 806 transA = true; 807 transB = false; 808 break; 809 case MATPRODUCT_ABt: 810 transA = false; 811 transB = true; 812 break; 813 default: 814 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 815 } 816 817 product->data = pdata = new MatProductData_SeqAIJKokkos(); 818 pdata->kh.set_team_work_size(16); 819 pdata->kh.set_dynamic_scheduling(true); 820 pdata->reusesym = product->api_user; 821 822 /* TODO: add command line options to select spgemm algorithms */ 823 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 824 825 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 826 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 827 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 828 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 829 #endif 830 #endif 831 832 pdata->kh.create_spgemm_handle(spgemm_alg); 833 834 PetscCall(PetscLogGpuTimeBegin()); 835 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 836 if (transA) { 837 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 838 transA = false; 839 } 840 841 if (transB) { 842 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 843 transB = false; 844 } 845 846 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 847 848 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 849 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 850 calling new Mat_SeqAIJKokkos(). 851 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 852 */ 853 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 854 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 855 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 856 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 857 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 858 #endif 859 PetscCall(PetscLogGpuTimeEnd()); 860 861 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 862 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 863 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 864 PetscFunctionReturn(PETSC_SUCCESS); 865 } 866 867 /* handles sparse matrix matrix ops */ 868 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 869 { 870 Mat_Product *product = mat->product; 871 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 872 873 PetscFunctionBegin; 874 MatCheckProduct(mat, 1); 875 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 876 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 877 if (Biskok && Ciskok) { 878 switch (product->type) { 879 case MATPRODUCT_AB: 880 case MATPRODUCT_AtB: 881 case MATPRODUCT_ABt: 882 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 883 break; 884 case MATPRODUCT_PtAP: 885 case MATPRODUCT_RARt: 886 case MATPRODUCT_ABC: 887 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 888 break; 889 default: 890 break; 891 } 892 } else { /* fallback for AIJ */ 893 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 894 } 895 PetscFunctionReturn(PETSC_SUCCESS); 896 } 897 898 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 899 { 900 Mat_SeqAIJKokkos *aijkok; 901 902 PetscFunctionBegin; 903 PetscCall(PetscLogGpuTimeBegin()); 904 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 905 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 906 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 907 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 908 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 909 PetscCall(PetscLogGpuTimeEnd()); 910 PetscFunctionReturn(PETSC_SUCCESS); 911 } 912 913 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 914 { 915 Mat_SeqAIJKokkos *aijkok; 916 917 PetscFunctionBegin; 918 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 919 if (aijkok) { /* Only zero the device if data is already there */ 920 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 921 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 922 } else { /* Might be preallocated but not assembled */ 923 PetscCall(MatZeroEntries_SeqAIJ(A)); 924 } 925 PetscFunctionReturn(PETSC_SUCCESS); 926 } 927 928 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 929 { 930 Mat_SeqAIJ *aijseq; 931 Mat_SeqAIJKokkos *aijkok; 932 PetscInt n; 933 PetscScalarKokkosView xv; 934 935 PetscFunctionBegin; 936 PetscCall(VecGetLocalSize(x, &n)); 937 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 938 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 939 940 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 941 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 942 943 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 944 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 945 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 946 aijkok->SetDiagonal(aijseq->diag); 947 } 948 949 const auto &Aa = aijkok->a_dual.view_device(); 950 const auto &Ai = aijkok->i_dual.view_device(); 951 const auto &Adiag = aijkok->diag_dual.view_device(); 952 953 PetscCall(VecGetKokkosViewWrite(x, &xv)); 954 Kokkos::parallel_for( 955 n, KOKKOS_LAMBDA(const PetscInt i) { 956 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 957 else xv(i) = 0; 958 }); 959 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 960 PetscFunctionReturn(PETSC_SUCCESS); 961 } 962 963 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 964 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 965 { 966 Mat_SeqAIJKokkos *aijkok; 967 968 PetscFunctionBegin; 969 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 970 PetscValidPointer(kv, 2); 971 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 972 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 973 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 974 *kv = aijkok->a_dual.view_device(); 975 PetscFunctionReturn(PETSC_SUCCESS); 976 } 977 978 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 979 { 980 PetscFunctionBegin; 981 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 982 PetscValidPointer(kv, 2); 983 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 988 { 989 Mat_SeqAIJKokkos *aijkok; 990 991 PetscFunctionBegin; 992 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 993 PetscValidPointer(kv, 2); 994 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 995 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 996 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 997 *kv = aijkok->a_dual.view_device(); 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1002 { 1003 PetscFunctionBegin; 1004 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1005 PetscValidPointer(kv, 2); 1006 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1007 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1008 PetscFunctionReturn(PETSC_SUCCESS); 1009 } 1010 1011 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1012 { 1013 Mat_SeqAIJKokkos *aijkok; 1014 1015 PetscFunctionBegin; 1016 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1017 PetscValidPointer(kv, 2); 1018 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1019 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1020 *kv = aijkok->a_dual.view_device(); 1021 PetscFunctionReturn(PETSC_SUCCESS); 1022 } 1023 1024 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1025 { 1026 PetscFunctionBegin; 1027 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1028 PetscValidPointer(kv, 2); 1029 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1030 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1031 PetscFunctionReturn(PETSC_SUCCESS); 1032 } 1033 1034 /* Computes Y += alpha X */ 1035 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1036 { 1037 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1038 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1039 ConstMatScalarKokkosView Xa; 1040 MatScalarKokkosView Ya; 1041 1042 PetscFunctionBegin; 1043 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1044 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1045 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1047 PetscCall(PetscLogGpuTimeBegin()); 1048 1049 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1050 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1051 PetscBool e; 1052 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1053 if (e) { 1054 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1055 if (e) pattern = SAME_NONZERO_PATTERN; 1056 } 1057 } 1058 1059 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1060 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1061 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1062 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1063 */ 1064 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1065 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1066 Xa = xkok->a_dual.view_device(); 1067 Ya = ykok->a_dual.view_device(); 1068 1069 if (pattern == SAME_NONZERO_PATTERN) { 1070 KokkosBlas::axpy(alpha, Xa, Ya); 1071 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1072 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1073 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1074 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1075 1076 Kokkos::parallel_for( 1077 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1078 PetscInt i = t.league_rank(); /* row i */ 1079 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1080 PetscInt p, q = Yi(i); 1081 for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1082 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1083 if (Xj(p) == Yj(q)) { /* Found it */ 1084 Ya(q) += alpha * Xa(p); 1085 q++; 1086 } else { 1087 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1088 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1089 */ 1090 if (Yi(i) != Yi(i + 1)) 1091 Ya(Yi(i)) = 1092 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 1093 Kokkos::ArithTraits<PetscScalar>::nan(); 1094 #else 1095 Kokkos::Experimental::nan("1"); 1096 #endif 1097 } 1098 } 1099 }); 1100 }); 1101 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1102 } else { /* different nonzero patterns */ 1103 Mat Z; 1104 KokkosCsrMatrix zcsr; 1105 KernelHandle kh; 1106 kh.create_spadd_handle(false); 1107 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1108 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1109 zkok = new Mat_SeqAIJKokkos(zcsr); 1110 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1111 PetscCall(MatHeaderReplace(Y, &Z)); 1112 kh.destroy_spadd_handle(); 1113 } 1114 PetscCall(PetscLogGpuTimeEnd()); 1115 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 1116 PetscFunctionReturn(PETSC_SUCCESS); 1117 } 1118 1119 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1120 { 1121 Mat_SeqAIJKokkos *akok; 1122 Mat_SeqAIJ *aseq; 1123 1124 PetscFunctionBegin; 1125 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1126 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1127 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1128 delete akok; 1129 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1130 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1131 akok->SetUpCOO(aseq); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1136 { 1137 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1138 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1139 PetscCount Annz = aseq->nz; 1140 const PetscCountKokkosView &jmap = akok->jmap_d; 1141 const PetscCountKokkosView &perm = akok->perm_d; 1142 MatScalarKokkosView Aa; 1143 ConstMatScalarKokkosView kv; 1144 PetscMemType memtype; 1145 1146 PetscFunctionBegin; 1147 PetscCall(PetscGetMemType(v, &memtype)); 1148 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1149 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1150 } else { 1151 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1152 } 1153 1154 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1155 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1156 1157 Kokkos::parallel_for( 1158 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1159 PetscScalar sum = 0.0; 1160 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1161 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1162 }); 1163 1164 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1165 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1166 PetscFunctionReturn(PETSC_SUCCESS); 1167 } 1168 1169 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1170 { 1171 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1172 MatScalarKokkosView Aa; 1173 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1174 PetscInt m = A->rmap->n; 1175 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1176 1177 PetscFunctionBegin; 1178 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1179 Kokkos::parallel_for( 1180 m, KOKKOS_LAMBDA(const PetscInt i) { 1181 PetscScalar tmp; 1182 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1183 tmp = Aa(Ai(i)); 1184 Aa(Ai(i)) = Aa(Adiag(i)); 1185 Aa(Adiag(i)) = tmp; 1186 } 1187 }); 1188 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1193 { 1194 PetscFunctionBegin; 1195 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1196 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1197 B->offloadmask = PETSC_OFFLOAD_CPU; 1198 PetscFunctionReturn(PETSC_SUCCESS); 1199 } 1200 1201 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1202 { 1203 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1204 1205 PetscFunctionBegin; 1206 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1207 A->boundtocpu = PETSC_FALSE; 1208 1209 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1210 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1211 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1212 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1213 A->ops->scale = MatScale_SeqAIJKokkos; 1214 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1215 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1216 A->ops->mult = MatMult_SeqAIJKokkos; 1217 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1218 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1219 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1220 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1221 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1222 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1223 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1224 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1225 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1226 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1227 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1228 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1229 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1230 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1231 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1232 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1233 1234 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1235 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1236 PetscFunctionReturn(PETSC_SUCCESS); 1237 } 1238 1239 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1240 { 1241 Mat_SeqAIJ *aseq; 1242 PetscInt i, m, n; 1243 1244 PetscFunctionBegin; 1245 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1246 1247 m = akok->nrows(); 1248 n = akok->ncols(); 1249 PetscCall(MatSetSizes(A, m, n, m, n)); 1250 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1251 1252 /* Set up data structures of A as a MATSEQAIJ */ 1253 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1254 aseq = (Mat_SeqAIJ *)(A)->data; 1255 1256 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1257 akok->j_dual.sync_host(); 1258 1259 aseq->i = akok->i_host_data(); 1260 aseq->j = akok->j_host_data(); 1261 aseq->a = akok->a_host_data(); 1262 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1263 aseq->singlemalloc = PETSC_FALSE; 1264 aseq->free_a = PETSC_FALSE; 1265 aseq->free_ij = PETSC_FALSE; 1266 aseq->nz = akok->nnz(); 1267 aseq->maxnz = aseq->nz; 1268 1269 PetscCall(PetscMalloc1(m, &aseq->imax)); 1270 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1271 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1272 1273 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274 akok->nonzerostate = A->nonzerostate; 1275 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1276 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1277 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1278 PetscFunctionReturn(PETSC_SUCCESS); 1279 } 1280 1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282 1283 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284 */ 1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1286 { 1287 PetscFunctionBegin; 1288 PetscCall(MatCreate(comm, A)); 1289 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1290 PetscFunctionReturn(PETSC_SUCCESS); 1291 } 1292 1293 /*@C 1294 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1295 (the default parallel PETSc format). This matrix will ultimately be handled by 1296 Kokkos for calculations. For good matrix 1297 assembly performance the user should preallocate the matrix storage by setting 1298 the parameter nz (or the array nnz). By setting these parameters accurately, 1299 performance during matrix assembly can be increased by more than a factor of 50. 1300 1301 Collective 1302 1303 Input Parameters: 1304 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1305 . m - number of rows 1306 . n - number of columns 1307 . nz - number of nonzeros per row (same for all rows) 1308 - nnz - array containing the number of nonzeros in the various rows 1309 (possibly different for each row) or `NULL` 1310 1311 Output Parameter: 1312 . A - the matrix 1313 1314 Level: intermediate 1315 1316 Notes: 1317 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1318 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1319 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1320 1321 If `nnz` is given then `nz` is ignored 1322 1323 The AIJ format, also called 1324 compressed row storage, is fully compatible with standard Fortran 1325 storage. That is, the stored row and column indices can begin at 1326 either one (as in Fortran) or zero. See the users' manual for details. 1327 1328 Specify the preallocated storage with either `nz` or `nnz` (not both). 1329 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1330 allocation. 1331 1332 By default, this format uses inodes (identical nodes) when possible, to 1333 improve numerical efficiency of matrix-vector products and solves. We 1334 search for consecutive rows with the same nonzero structure, thereby 1335 reusing matrix information to achieve increased efficiency. 1336 1337 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1338 @*/ 1339 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1340 { 1341 PetscFunctionBegin; 1342 PetscCall(PetscKokkosInitializeCheck()); 1343 PetscCall(MatCreate(comm, A)); 1344 PetscCall(MatSetSizes(*A, m, n, m, n)); 1345 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1346 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1347 PetscFunctionReturn(PETSC_SUCCESS); 1348 } 1349 1350 typedef Kokkos::TeamPolicy<>::member_type team_member; 1351 // 1352 // This factorization exploits block diagonal matrices with "Nf" (not used). 1353 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1354 // 1355 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1356 { 1357 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1358 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1359 IS isrow = b->row, isicol = b->icol; 1360 const PetscInt *r_h, *ic_h; 1361 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1362 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1363 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1364 PetscBool row_identity, col_identity; 1365 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1366 1367 PetscFunctionBegin; 1368 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1369 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1370 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1371 PetscCall(ISGetIndices(isrow, &r_h)); 1372 PetscCall(ISGetIndices(isicol, &ic_h)); 1373 PetscCall(ISGetSize(isicol, &nc)); 1374 PetscCall(PetscLogGpuTimeBegin()); 1375 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1376 { 1377 #define KOKKOS_SHARED_LEVEL 1 1378 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1379 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1380 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1381 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1382 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1383 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1384 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1385 size_t flops_h = 0.0; 1386 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1387 Kokkos::View<size_t> d_flops_k("flops"); 1388 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1389 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 1390 Kokkos::deep_copy(d_flops_k, h_flops_k); 1391 Kokkos::deep_copy(d_r_k, h_r_k); 1392 Kokkos::deep_copy(d_ic_k, h_ic_k); 1393 // Fill A --> fact 1394 Kokkos::parallel_for( 1395 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1396 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1397 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1398 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1399 // zero rows of B 1400 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1401 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1402 PetscScalar *baL = ba_d + bi_d[rowb]; 1403 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1404 /* zero (unfactored row) */ 1405 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1406 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1407 }); 1408 // copy A into B 1409 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1410 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1411 const PetscScalar *av = aa_d + ai_d[rowa]; 1412 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1413 /* load in initial (unfactored row) */ 1414 for (int j = 0; j < nza; j++) { 1415 PetscInt colb = ic[ajtmp[j]]; 1416 PetscScalar vala = av[j]; 1417 if (colb == rowb) { 1418 *(ba_d + bdiag_d[rowb]) = vala; 1419 } else { 1420 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1421 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1422 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1423 for (int j = 0; j < nz; j++) { 1424 if (pbj[j] == colb) { 1425 pba[j] = vala; 1426 set++; 1427 break; 1428 } 1429 } 1430 #if !defined(PETSC_HAVE_SYCL) 1431 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1432 #endif 1433 } 1434 } 1435 }); 1436 }); 1437 Kokkos::fence(); 1438 1439 Kokkos::parallel_for( 1440 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1441 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1442 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1443 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1444 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1445 const PetscInt start = field * nloc, end = start + nloc; 1446 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1447 // A22 panel update for each row A(1,:) and col A(:,1) 1448 for (int ii = start; ii < end - 1; ii++) { 1449 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1450 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1451 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1452 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1453 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1454 PetscInt kIdx = j * Ni + field_block_idx; 1455 if (kIdx >= nzUi) /* void */ 1456 ; 1457 else { 1458 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1459 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1460 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1461 size_t st_idx; 1462 // find and do L(k,i) = A(:k,i) / A(i,i) 1463 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1464 // get column, there has got to be a better way 1465 Kokkos::parallel_reduce( 1466 Kokkos::ThreadVectorRange(team, nzL), 1467 [&](const int &j, size_t &idx) { 1468 if (pjL[j] == ii) { 1469 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1470 idx = j; // output 1471 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1472 } 1473 }, 1474 st_idx); 1475 Kokkos::single(Kokkos::PerThread(team), [=]() { 1476 colkIdx() = st_idx; 1477 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1478 }); 1479 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1480 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1481 #endif 1482 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1483 // U(i+1,:end) 1484 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1485 PetscScalar Uij = baUi[uiIdx]; 1486 PetscInt col = bjUi[uiIdx]; 1487 if (col == myk) { 1488 // A_kk = A_kk - L_ki * U_ij(k) 1489 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1490 *Akkv = *Akkv - L_ki() * Uij; // UiK 1491 } else { 1492 PetscScalar *start, *end, *pAkjv = NULL; 1493 PetscInt high, low; 1494 const PetscInt *startj; 1495 if (col < myk) { // L 1496 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1497 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1498 start = pLki + 1; // start at pLki+1, A22(myk,1) 1499 startj = bj_d + bi_d[myk] + idx; 1500 end = ba_d + bi_d[myk + 1]; 1501 } else { 1502 PetscInt idx = bdiag_d[myk + 1] + 1; 1503 start = ba_d + idx; 1504 startj = bj_d + idx; 1505 end = ba_d + bdiag_d[myk]; 1506 } 1507 // search for 'col', use bisection search - TODO 1508 low = 0; 1509 high = (PetscInt)(end - start); 1510 while (high - low > 5) { 1511 int t = (low + high) / 2; 1512 if (startj[t] > col) high = t; 1513 else low = t; 1514 } 1515 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1516 if (startj[pAkjv - start] == col) break; 1517 } 1518 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1519 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1520 #endif 1521 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1522 } 1523 }); 1524 } 1525 }); 1526 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1527 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1528 } /* endof for (i=0; i<n; i++) { */ 1529 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1530 Kokkos::atomic_add(&d_flops_k(), flops()); 1531 flops() = 0; 1532 }); 1533 }); 1534 Kokkos::fence(); 1535 Kokkos::deep_copy(h_flops_k, d_flops_k); 1536 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1537 Kokkos::parallel_for( 1538 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1539 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1540 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1541 /* Invert diagonal for simpler triangular solves */ 1542 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1543 int i = start + outer_index * Ni + lg_rank % Ni; 1544 if (i < end) { 1545 PetscScalar *pv = ba_d + bdiag_d[i]; 1546 *pv = 1.0 / (*pv); 1547 } 1548 }); 1549 }); 1550 } 1551 PetscCall(PetscLogGpuTimeEnd()); 1552 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1553 PetscCall(ISRestoreIndices(isrow, &r_h)); 1554 1555 PetscCall(ISIdentity(isrow, &row_identity)); 1556 PetscCall(ISIdentity(isicol, &col_identity)); 1557 if (b->inode.size) { 1558 B->ops->solve = MatSolve_SeqAIJ_Inode; 1559 } else if (row_identity && col_identity) { 1560 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1561 } else { 1562 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1563 } 1564 B->offloadmask = PETSC_OFFLOAD_GPU; 1565 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1566 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1567 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1568 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1569 B->ops->matsolve = MatMatSolve_SeqAIJ; 1570 B->assembled = PETSC_TRUE; 1571 B->preallocated = PETSC_TRUE; 1572 1573 PetscFunctionReturn(PETSC_SUCCESS); 1574 } 1575 1576 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1577 { 1578 PetscFunctionBegin; 1579 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1580 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1585 { 1586 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1587 1588 PetscFunctionBegin; 1589 if (!factors->sptrsv_symbolic_completed) { 1590 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1591 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1592 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1593 } 1594 PetscFunctionReturn(PETSC_SUCCESS); 1595 } 1596 1597 /* Check if we need to update factors etc for transpose solve */ 1598 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1599 { 1600 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1601 MatColIdxType n = A->rmap->n; 1602 1603 PetscFunctionBegin; 1604 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1605 /* Update L^T and do sptrsv symbolic */ 1606 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1607 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1608 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1609 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1610 1611 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1612 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1613 1614 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1615 We have to sort the indices, until KK provides finer control options. 1616 */ 1617 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1618 1619 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1620 1621 /* Update U^T and do sptrsv symbolic */ 1622 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1623 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1624 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1625 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1626 1627 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1628 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1629 1630 /* Sort indices. See comments above */ 1631 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1632 1633 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1634 factors->transpose_updated = PETSC_TRUE; 1635 } 1636 PetscFunctionReturn(PETSC_SUCCESS); 1637 } 1638 1639 /* Solve Ax = b, with A = LU */ 1640 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1641 { 1642 ConstPetscScalarKokkosView bv; 1643 PetscScalarKokkosView xv; 1644 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1645 1646 PetscFunctionBegin; 1647 PetscCall(PetscLogGpuTimeBegin()); 1648 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1649 PetscCall(VecGetKokkosView(b, &bv)); 1650 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1651 /* Solve L tmpv = b */ 1652 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1653 /* Solve Ux = tmpv */ 1654 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1655 PetscCall(VecRestoreKokkosView(b, &bv)); 1656 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1657 PetscCall(PetscLogGpuTimeEnd()); 1658 PetscFunctionReturn(PETSC_SUCCESS); 1659 } 1660 1661 /* Solve A^T x = b, where A^T = U^T L^T */ 1662 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1663 { 1664 ConstPetscScalarKokkosView bv; 1665 PetscScalarKokkosView xv; 1666 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1667 1668 PetscFunctionBegin; 1669 PetscCall(PetscLogGpuTimeBegin()); 1670 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1671 PetscCall(VecGetKokkosView(b, &bv)); 1672 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1673 /* Solve U^T tmpv = b */ 1674 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1675 1676 /* Solve L^T x = tmpv */ 1677 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1678 PetscCall(VecRestoreKokkosView(b, &bv)); 1679 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1680 PetscCall(PetscLogGpuTimeEnd()); 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1685 { 1686 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1687 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1688 PetscInt fill_lev = info->levels; 1689 1690 PetscFunctionBegin; 1691 PetscCall(PetscLogGpuTimeBegin()); 1692 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1693 1694 auto a_d = aijkok->a_dual.view_device(); 1695 auto i_d = aijkok->i_dual.view_device(); 1696 auto j_d = aijkok->j_dual.view_device(); 1697 1698 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1699 1700 B->assembled = PETSC_TRUE; 1701 B->preallocated = PETSC_TRUE; 1702 B->ops->solve = MatSolve_SeqAIJKokkos; 1703 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1704 B->ops->matsolve = NULL; 1705 B->ops->matsolvetranspose = NULL; 1706 B->offloadmask = PETSC_OFFLOAD_GPU; 1707 1708 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1709 factors->transpose_updated = PETSC_FALSE; 1710 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1711 /* TODO: log flops, but how to know that? */ 1712 PetscCall(PetscLogGpuTimeEnd()); 1713 PetscFunctionReturn(PETSC_SUCCESS); 1714 } 1715 1716 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1717 { 1718 Mat_SeqAIJKokkos *aijkok; 1719 Mat_SeqAIJ *b; 1720 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1721 PetscInt fill_lev = info->levels; 1722 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1723 PetscInt n = A->rmap->n; 1724 1725 PetscFunctionBegin; 1726 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1727 /* Rebuild factors */ 1728 if (factors) { 1729 factors->Destroy(); 1730 } /* Destroy the old if it exists */ 1731 else { 1732 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1733 } 1734 1735 /* Create a spiluk handle and then do symbolic factorization */ 1736 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1737 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1738 1739 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1740 1741 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1742 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1743 Kokkos::realloc(factors->iU_d, n + 1); 1744 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1745 1746 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1747 auto i_d = aijkok->i_dual.view_device(); 1748 auto j_d = aijkok->j_dual.view_device(); 1749 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1750 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1751 1752 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1753 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1754 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1755 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1756 1757 /* TODO: add options to select sptrsv algorithms */ 1758 /* Create sptrsv handles for L, U and their transpose */ 1759 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1760 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1761 #else 1762 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1763 #endif 1764 1765 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1766 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1767 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1768 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1769 1770 /* Fill fields of the factor matrix B */ 1771 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1772 b = (Mat_SeqAIJ *)B->data; 1773 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1774 B->info.fill_ratio_given = info->fill; 1775 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1776 1777 B->offloadmask = PETSC_OFFLOAD_GPU; 1778 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1779 PetscFunctionReturn(PETSC_SUCCESS); 1780 } 1781 1782 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1783 { 1784 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1785 const PetscInt nrows = A->rmap->n; 1786 1787 PetscFunctionBegin; 1788 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1789 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1790 // move B data into Kokkos 1791 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1792 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1793 { 1794 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1795 if (!baijkok->diag_d.extent(0)) { 1796 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1797 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1798 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1799 } 1800 } 1801 PetscFunctionReturn(PETSC_SUCCESS); 1802 } 1803 1804 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1805 { 1806 PetscFunctionBegin; 1807 *type = MATSOLVERKOKKOS; 1808 PetscFunctionReturn(PETSC_SUCCESS); 1809 } 1810 1811 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1812 { 1813 PetscFunctionBegin; 1814 *type = MATSOLVERKOKKOSDEVICE; 1815 PetscFunctionReturn(PETSC_SUCCESS); 1816 } 1817 1818 /*MC 1819 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1820 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1821 1822 Level: beginner 1823 1824 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1825 M*/ 1826 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1827 { 1828 PetscInt n = A->rmap->n; 1829 1830 PetscFunctionBegin; 1831 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1832 PetscCall(MatSetSizes(*B, n, n, n, n)); 1833 (*B)->factortype = ftype; 1834 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1835 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1836 1837 if (ftype == MAT_FACTOR_LU) { 1838 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1839 (*B)->canuseordering = PETSC_TRUE; 1840 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1841 } else if (ftype == MAT_FACTOR_ILU) { 1842 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1843 (*B)->canuseordering = PETSC_FALSE; 1844 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1845 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1846 1847 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1848 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1849 PetscFunctionReturn(PETSC_SUCCESS); 1850 } 1851 1852 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1853 { 1854 PetscInt n = A->rmap->n; 1855 1856 PetscFunctionBegin; 1857 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1858 PetscCall(MatSetSizes(*B, n, n, n, n)); 1859 (*B)->factortype = ftype; 1860 (*B)->canuseordering = PETSC_TRUE; 1861 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1862 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1863 1864 if (ftype == MAT_FACTOR_LU) { 1865 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1866 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1867 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1868 1869 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1870 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1871 PetscFunctionReturn(PETSC_SUCCESS); 1872 } 1873 1874 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1875 { 1876 PetscFunctionBegin; 1877 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1878 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1879 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1880 PetscFunctionReturn(PETSC_SUCCESS); 1881 } 1882 1883 /* Utility to print out a KokkosCsrMatrix for debugging */ 1884 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1885 { 1886 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1887 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1888 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1889 const PetscInt *i = iv.data(); 1890 const PetscInt *j = jv.data(); 1891 const PetscScalar *a = av.data(); 1892 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1893 1894 PetscFunctionBegin; 1895 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1896 for (PetscInt k = 0; k < m; k++) { 1897 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1898 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1899 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1900 } 1901 PetscFunctionReturn(PETSC_SUCCESS); 1902 } 1903