1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123 } 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(PETSC_SUCCESS); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166 } 167 PetscFunctionReturn(PETSC_SUCCESS); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(PETSC_SUCCESS); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 /* Generate the transpose on device and cache it internally */ 228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 234 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 235 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 236 PetscCallCXX(aijkok->a_dual.sync_device()); 237 PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 238 PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 239 aijkok->transpose_updated = PETSC_TRUE; 240 } 241 *csrmatT = &aijkok->csrmatT; 242 PetscFunctionReturn(PETSC_SUCCESS); 243 } 244 245 /* Generate the Hermitian on device and cache it internally */ 246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 247 { 248 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 249 250 PetscFunctionBegin; 251 PetscCall(PetscLogGpuTimeBegin()); 252 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 253 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 254 PetscCallCXX(aijkok->a_dual.sync_device()); 255 PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 256 PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 257 #if defined(PETSC_USE_COMPLEX) 258 const auto &a = aijkok->csrmatH.values; 259 Kokkos::parallel_for( 260 a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 261 #endif 262 aijkok->hermitian_updated = PETSC_TRUE; 263 } 264 *csrmatH = &aijkok->csrmatH; 265 PetscCall(PetscLogGpuTimeEnd()); 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 /* y = A x */ 270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 271 { 272 Mat_SeqAIJKokkos *aijkok; 273 ConstPetscScalarKokkosView xv; 274 PetscScalarKokkosView yv; 275 276 PetscFunctionBegin; 277 PetscCall(PetscLogGpuTimeBegin()); 278 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 279 PetscCall(VecGetKokkosView(xx, &xv)); 280 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 281 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 282 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 283 PetscCall(VecRestoreKokkosView(xx, &xv)); 284 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 285 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 286 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 287 PetscCall(PetscLogGpuTimeEnd()); 288 PetscFunctionReturn(PETSC_SUCCESS); 289 } 290 291 /* y = A^T x */ 292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 293 { 294 Mat_SeqAIJKokkos *aijkok; 295 const char *mode; 296 ConstPetscScalarKokkosView xv; 297 PetscScalarKokkosView yv; 298 KokkosCsrMatrix *csrmat; 299 300 PetscFunctionBegin; 301 PetscCall(PetscLogGpuTimeBegin()); 302 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 303 PetscCall(VecGetKokkosView(xx, &xv)); 304 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 305 if (A->form_explicit_transpose) { 306 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 307 mode = "N"; 308 } else { 309 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310 csrmat = &aijkok->csrmat; 311 mode = "T"; 312 } 313 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 314 PetscCall(VecRestoreKokkosView(xx, &xv)); 315 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 316 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 317 PetscCall(PetscLogGpuTimeEnd()); 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 /* y = A^H x */ 322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 323 { 324 Mat_SeqAIJKokkos *aijkok; 325 const char *mode; 326 ConstPetscScalarKokkosView xv; 327 PetscScalarKokkosView yv; 328 KokkosCsrMatrix *csrmat; 329 330 PetscFunctionBegin; 331 PetscCall(PetscLogGpuTimeBegin()); 332 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 333 PetscCall(VecGetKokkosView(xx, &xv)); 334 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 335 if (A->form_explicit_transpose) { 336 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 337 mode = "N"; 338 } else { 339 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 340 csrmat = &aijkok->csrmat; 341 mode = "C"; 342 } 343 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 344 PetscCall(VecRestoreKokkosView(xx, &xv)); 345 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 346 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 347 PetscCall(PetscLogGpuTimeEnd()); 348 PetscFunctionReturn(PETSC_SUCCESS); 349 } 350 351 /* z = A x + y */ 352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 353 { 354 Mat_SeqAIJKokkos *aijkok; 355 ConstPetscScalarKokkosView xv, yv; 356 PetscScalarKokkosView zv; 357 358 PetscFunctionBegin; 359 PetscCall(PetscLogGpuTimeBegin()); 360 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 361 PetscCall(VecGetKokkosView(xx, &xv)); 362 PetscCall(VecGetKokkosView(yy, &yv)); 363 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 364 if (zz != yy) Kokkos::deep_copy(zv, yv); 365 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 366 PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 367 PetscCall(VecRestoreKokkosView(xx, &xv)); 368 PetscCall(VecRestoreKokkosView(yy, &yv)); 369 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 370 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 371 PetscCall(PetscLogGpuTimeEnd()); 372 PetscFunctionReturn(PETSC_SUCCESS); 373 } 374 375 /* z = A^T x + y */ 376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 377 { 378 Mat_SeqAIJKokkos *aijkok; 379 const char *mode; 380 ConstPetscScalarKokkosView xv, yv; 381 PetscScalarKokkosView zv; 382 KokkosCsrMatrix *csrmat; 383 384 PetscFunctionBegin; 385 PetscCall(PetscLogGpuTimeBegin()); 386 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 387 PetscCall(VecGetKokkosView(xx, &xv)); 388 PetscCall(VecGetKokkosView(yy, &yv)); 389 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 390 if (zz != yy) Kokkos::deep_copy(zv, yv); 391 if (A->form_explicit_transpose) { 392 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 393 mode = "N"; 394 } else { 395 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 396 csrmat = &aijkok->csrmat; 397 mode = "T"; 398 } 399 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 400 PetscCall(VecRestoreKokkosView(xx, &xv)); 401 PetscCall(VecRestoreKokkosView(yy, &yv)); 402 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 403 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 404 PetscCall(PetscLogGpuTimeEnd()); 405 PetscFunctionReturn(PETSC_SUCCESS); 406 } 407 408 /* z = A^H x + y */ 409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 410 { 411 Mat_SeqAIJKokkos *aijkok; 412 const char *mode; 413 ConstPetscScalarKokkosView xv, yv; 414 PetscScalarKokkosView zv; 415 KokkosCsrMatrix *csrmat; 416 417 PetscFunctionBegin; 418 PetscCall(PetscLogGpuTimeBegin()); 419 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 420 PetscCall(VecGetKokkosView(xx, &xv)); 421 PetscCall(VecGetKokkosView(yy, &yv)); 422 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 423 if (zz != yy) Kokkos::deep_copy(zv, yv); 424 if (A->form_explicit_transpose) { 425 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 426 mode = "N"; 427 } else { 428 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 429 csrmat = &aijkok->csrmat; 430 mode = "C"; 431 } 432 PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 433 PetscCall(VecRestoreKokkosView(xx, &xv)); 434 PetscCall(VecRestoreKokkosView(yy, &yv)); 435 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 436 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 437 PetscCall(PetscLogGpuTimeEnd()); 438 PetscFunctionReturn(PETSC_SUCCESS); 439 } 440 441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 442 { 443 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 444 445 PetscFunctionBegin; 446 switch (op) { 447 case MAT_FORM_EXPLICIT_TRANSPOSE: 448 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 449 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 450 A->form_explicit_transpose = flg; 451 break; 452 default: 453 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 454 break; 455 } 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* Depending on reuse, either build a new mat, or use the existing mat */ 460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 461 { 462 Mat_SeqAIJ *aseq; 463 464 PetscFunctionBegin; 465 PetscCall(PetscKokkosInitializeCheck()); 466 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 467 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 468 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 469 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 470 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 471 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 472 PetscCall(PetscFree(A->defaultvectype)); 473 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 474 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 475 PetscCall(MatSetOps_SeqAIJKokkos(A)); 476 aseq = static_cast<Mat_SeqAIJ *>(A->data); 477 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 478 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 479 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 480 } 481 } 482 PetscFunctionReturn(PETSC_SUCCESS); 483 } 484 485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 486 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 487 */ 488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 489 { 490 Mat_SeqAIJ *bseq; 491 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 492 Mat mat; 493 494 PetscFunctionBegin; 495 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 496 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 497 mat = *B; 498 if (A->assembled) { 499 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 500 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 501 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 502 /* Now copy values to B if needed */ 503 if (dupOption == MAT_COPY_VALUES) { 504 if (akok->a_dual.need_sync_device()) { 505 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 506 bkok->a_dual.modify_host(); 507 } else { /* If device has the latest data, we only copy data on device */ 508 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 509 bkok->a_dual.modify_device(); 510 } 511 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 512 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 513 bkok->a_dual.modify_host(); 514 } 515 mat->spptr = bkok; 516 } 517 518 PetscCall(PetscFree(mat->defaultvectype)); 519 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 520 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 521 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 522 PetscFunctionReturn(PETSC_SUCCESS); 523 } 524 525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 526 { 527 Mat At; 528 KokkosCsrMatrix *internT; 529 Mat_SeqAIJKokkos *atkok, *bkok; 530 531 PetscFunctionBegin; 532 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 533 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 534 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 535 /* Deep copy internT, as we want to isolate the internal transpose */ 536 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 537 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 538 if (reuse == MAT_INITIAL_MATRIX) *B = At; 539 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 540 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 541 if ((*B)->assembled) { 542 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 543 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 544 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 545 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 546 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 547 MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 548 MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 549 PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 550 PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 551 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 552 } 553 PetscFunctionReturn(PETSC_SUCCESS); 554 } 555 556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 557 { 558 Mat_SeqAIJKokkos *aijkok; 559 560 PetscFunctionBegin; 561 if (A->factortype == MAT_FACTOR_NONE) { 562 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 563 delete aijkok; 564 } else { 565 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 566 } 567 A->spptr = NULL; 568 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 569 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 570 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 571 PetscCall(MatDestroy_SeqAIJ(A)); 572 PetscFunctionReturn(PETSC_SUCCESS); 573 } 574 575 /*MC 576 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 577 578 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 579 580 Options Database Keys: 581 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 582 583 Level: beginner 584 585 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 586 M*/ 587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 588 { 589 PetscFunctionBegin; 590 PetscCall(PetscKokkosInitializeCheck()); 591 PetscCall(MatCreate_SeqAIJ(A)); 592 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 598 { 599 Mat_SeqAIJ *a, *b; 600 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 601 MatScalarKokkosView aa, ba, ca; 602 MatRowMapKokkosView ai, bi, ci; 603 MatColIdxKokkosView aj, bj, cj; 604 PetscInt m, n, nnz, aN; 605 606 PetscFunctionBegin; 607 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 608 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 609 PetscValidPointer(C, 4); 610 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 611 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 612 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 613 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 614 615 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 616 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 617 a = static_cast<Mat_SeqAIJ *>(A->data); 618 b = static_cast<Mat_SeqAIJ *>(B->data); 619 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 620 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 621 aa = akok->a_dual.view_device(); 622 ai = akok->i_dual.view_device(); 623 ba = bkok->a_dual.view_device(); 624 bi = bkok->i_dual.view_device(); 625 m = A->rmap->n; /* M, N and nnz of C */ 626 n = A->cmap->n + B->cmap->n; 627 nnz = a->nz + b->nz; 628 aN = A->cmap->n; /* N of A */ 629 if (reuse == MAT_INITIAL_MATRIX) { 630 aj = akok->j_dual.view_device(); 631 bj = bkok->j_dual.view_device(); 632 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 633 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 634 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 635 ca = ca_dual.view_device(); 636 ci = ci_dual.view_device(); 637 cj = cj_dual.view_device(); 638 639 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 640 Kokkos::parallel_for( 641 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 642 PetscInt i = t.league_rank(); /* row i */ 643 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 644 645 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 646 ci(i) = coffset; 647 if (i == m - 1) ci(m) = ai(m) + bi(m); 648 }); 649 650 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 651 if (k < alen) { 652 ca(coffset + k) = aa(ai(i) + k); 653 cj(coffset + k) = aj(ai(i) + k); 654 } else { 655 ca(coffset + k) = ba(bi(i) + k - alen); 656 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 657 } 658 }); 659 }); 660 ca_dual.modify_device(); 661 ci_dual.modify_device(); 662 cj_dual.modify_device(); 663 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 664 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 665 } else if (reuse == MAT_REUSE_MATRIX) { 666 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 667 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 668 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 669 ca = ckok->a_dual.view_device(); 670 ci = ckok->i_dual.view_device(); 671 672 Kokkos::parallel_for( 673 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 674 PetscInt i = t.league_rank(); /* row i */ 675 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 676 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 677 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 678 else ca(ci(i) + k) = ba(bi(i) + k - alen); 679 }); 680 }); 681 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 682 } 683 PetscFunctionReturn(PETSC_SUCCESS); 684 } 685 686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 687 { 688 PetscFunctionBegin; 689 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 690 PetscFunctionReturn(PETSC_SUCCESS); 691 } 692 693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 694 { 695 Mat_Product *product = C->product; 696 Mat A, B; 697 bool transA, transB; /* use bool, since KK needs this type */ 698 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 699 Mat_SeqAIJ *c; 700 MatProductData_SeqAIJKokkos *pdata; 701 KokkosCsrMatrix *csrmatA, *csrmatB; 702 703 PetscFunctionBegin; 704 MatCheckProduct(C, 1); 705 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 706 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 707 708 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 709 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 switch (product->type) { 714 case MATPRODUCT_AB: 715 transA = false; 716 transB = false; 717 break; 718 case MATPRODUCT_AtB: 719 transA = true; 720 transB = false; 721 break; 722 case MATPRODUCT_ABt: 723 transA = false; 724 transB = true; 725 break; 726 default: 727 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 728 } 729 730 A = product->A; 731 B = product->B; 732 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 733 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 734 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 735 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 736 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 737 738 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 739 740 csrmatA = &akok->csrmat; 741 csrmatB = &bkok->csrmat; 742 743 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 744 if (transA) { 745 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 746 transA = false; 747 } 748 749 if (transB) { 750 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 751 transB = false; 752 } 753 PetscCall(PetscLogGpuTimeBegin()); 754 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 755 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 756 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 757 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 758 #endif 759 760 PetscCall(PetscLogGpuTimeEnd()); 761 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 762 /* shorter version of MatAssemblyEnd_SeqAIJ */ 763 c = (Mat_SeqAIJ *)C->data; 764 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 765 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 766 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 767 c->reallocs = 0; 768 C->info.mallocs = 0; 769 C->info.nz_unneeded = 0; 770 C->assembled = C->was_assembled = PETSC_TRUE; 771 C->num_ass++; 772 PetscFunctionReturn(PETSC_SUCCESS); 773 } 774 775 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 776 { 777 Mat_Product *product = C->product; 778 MatProductType ptype; 779 Mat A, B; 780 bool transA, transB; 781 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 782 MatProductData_SeqAIJKokkos *pdata; 783 MPI_Comm comm; 784 KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 785 786 PetscFunctionBegin; 787 MatCheckProduct(C, 1); 788 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 789 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 790 A = product->A; 791 B = product->B; 792 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 793 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 794 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 795 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 796 csrmatA = &akok->csrmat; 797 csrmatB = &bkok->csrmat; 798 799 ptype = product->type; 800 switch (ptype) { 801 case MATPRODUCT_AB: 802 transA = false; 803 transB = false; 804 break; 805 case MATPRODUCT_AtB: 806 transA = true; 807 transB = false; 808 break; 809 case MATPRODUCT_ABt: 810 transA = false; 811 transB = true; 812 break; 813 default: 814 SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 815 } 816 817 product->data = pdata = new MatProductData_SeqAIJKokkos(); 818 pdata->kh.set_team_work_size(16); 819 pdata->kh.set_dynamic_scheduling(true); 820 pdata->reusesym = product->api_user; 821 822 /* TODO: add command line options to select spgemm algorithms */ 823 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 824 825 /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 826 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 827 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 828 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 829 #endif 830 #endif 831 832 pdata->kh.create_spgemm_handle(spgemm_alg); 833 834 PetscCall(PetscLogGpuTimeBegin()); 835 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 836 if (transA) { 837 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 838 transA = false; 839 } 840 841 if (transB) { 842 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 843 transB = false; 844 } 845 846 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 847 848 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 849 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 850 calling new Mat_SeqAIJKokkos(). 851 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 852 */ 853 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 854 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 855 /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 856 auto spgemmHandle = pdata->kh.get_spgemm_handle(); 857 if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 858 #endif 859 PetscCall(PetscLogGpuTimeEnd()); 860 861 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 862 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 863 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 864 PetscFunctionReturn(PETSC_SUCCESS); 865 } 866 867 /* handles sparse matrix matrix ops */ 868 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 869 { 870 Mat_Product *product = mat->product; 871 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 872 873 PetscFunctionBegin; 874 MatCheckProduct(mat, 1); 875 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 876 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 877 if (Biskok && Ciskok) { 878 switch (product->type) { 879 case MATPRODUCT_AB: 880 case MATPRODUCT_AtB: 881 case MATPRODUCT_ABt: 882 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 883 break; 884 case MATPRODUCT_PtAP: 885 case MATPRODUCT_RARt: 886 case MATPRODUCT_ABC: 887 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 888 break; 889 default: 890 break; 891 } 892 } else { /* fallback for AIJ */ 893 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 894 } 895 PetscFunctionReturn(PETSC_SUCCESS); 896 } 897 898 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 899 { 900 Mat_SeqAIJKokkos *aijkok; 901 902 PetscFunctionBegin; 903 PetscCall(PetscLogGpuTimeBegin()); 904 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 905 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 906 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 907 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 908 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 909 PetscCall(PetscLogGpuTimeEnd()); 910 PetscFunctionReturn(PETSC_SUCCESS); 911 } 912 913 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 914 { 915 Mat_SeqAIJKokkos *aijkok; 916 917 PetscFunctionBegin; 918 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 919 if (aijkok) { /* Only zero the device if data is already there */ 920 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 921 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 922 } else { /* Might be preallocated but not assembled */ 923 PetscCall(MatZeroEntries_SeqAIJ(A)); 924 } 925 PetscFunctionReturn(PETSC_SUCCESS); 926 } 927 928 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 929 { 930 Mat_SeqAIJ *aijseq; 931 Mat_SeqAIJKokkos *aijkok; 932 PetscInt n; 933 PetscScalarKokkosView xv; 934 935 PetscFunctionBegin; 936 PetscCall(VecGetLocalSize(x, &n)); 937 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 938 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 939 940 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 941 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 942 943 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 944 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 945 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 946 aijkok->SetDiagonal(aijseq->diag); 947 } 948 949 const auto &Aa = aijkok->a_dual.view_device(); 950 const auto &Ai = aijkok->i_dual.view_device(); 951 const auto &Adiag = aijkok->diag_dual.view_device(); 952 953 PetscCall(VecGetKokkosViewWrite(x, &xv)); 954 Kokkos::parallel_for( 955 n, KOKKOS_LAMBDA(const PetscInt i) { 956 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 957 else xv(i) = 0; 958 }); 959 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 960 PetscFunctionReturn(PETSC_SUCCESS); 961 } 962 963 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 964 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 965 { 966 Mat_SeqAIJKokkos *aijkok; 967 968 PetscFunctionBegin; 969 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 970 PetscValidPointer(kv, 2); 971 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 972 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 973 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 974 *kv = aijkok->a_dual.view_device(); 975 PetscFunctionReturn(PETSC_SUCCESS); 976 } 977 978 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 979 { 980 PetscFunctionBegin; 981 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 982 PetscValidPointer(kv, 2); 983 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 988 { 989 Mat_SeqAIJKokkos *aijkok; 990 991 PetscFunctionBegin; 992 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 993 PetscValidPointer(kv, 2); 994 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 995 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 996 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 997 *kv = aijkok->a_dual.view_device(); 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1002 { 1003 PetscFunctionBegin; 1004 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1005 PetscValidPointer(kv, 2); 1006 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1007 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1008 PetscFunctionReturn(PETSC_SUCCESS); 1009 } 1010 1011 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1012 { 1013 Mat_SeqAIJKokkos *aijkok; 1014 1015 PetscFunctionBegin; 1016 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1017 PetscValidPointer(kv, 2); 1018 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1019 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1020 *kv = aijkok->a_dual.view_device(); 1021 PetscFunctionReturn(PETSC_SUCCESS); 1022 } 1023 1024 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1025 { 1026 PetscFunctionBegin; 1027 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1028 PetscValidPointer(kv, 2); 1029 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1030 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1031 PetscFunctionReturn(PETSC_SUCCESS); 1032 } 1033 1034 /* Computes Y += alpha X */ 1035 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1036 { 1037 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1038 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1039 ConstMatScalarKokkosView Xa; 1040 MatScalarKokkosView Ya; 1041 1042 PetscFunctionBegin; 1043 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1044 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 1045 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1046 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1047 PetscCall(PetscLogGpuTimeBegin()); 1048 1049 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1050 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1051 PetscBool e; 1052 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1053 if (e) { 1054 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1055 if (e) pattern = SAME_NONZERO_PATTERN; 1056 } 1057 } 1058 1059 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1060 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1061 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1062 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1063 */ 1064 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1065 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1066 Xa = xkok->a_dual.view_device(); 1067 Ya = ykok->a_dual.view_device(); 1068 1069 if (pattern == SAME_NONZERO_PATTERN) { 1070 KokkosBlas::axpy(alpha, Xa, Ya); 1071 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1072 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1073 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1074 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1075 1076 Kokkos::parallel_for( 1077 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1078 PetscInt i = t.league_rank(); /* row i */ 1079 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1080 PetscInt p, q = Yi(i); 1081 for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1082 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1083 if (Xj(p) == Yj(q)) { /* Found it */ 1084 Ya(q) += alpha * Xa(p); 1085 q++; 1086 } else { 1087 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1088 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1089 */ 1090 if (Yi(i) != Yi(i + 1)) 1091 Ya(Yi(i)) = 1092 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 1093 Kokkos::ArithTraits<PetscScalar>::nan(); 1094 #else 1095 Kokkos::Experimental::nan("1"); 1096 #endif 1097 } 1098 } 1099 }); 1100 }); 1101 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1102 } else { /* different nonzero patterns */ 1103 Mat Z; 1104 KokkosCsrMatrix zcsr; 1105 KernelHandle kh; 1106 kh.create_spadd_handle(false); 1107 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1108 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1109 zkok = new Mat_SeqAIJKokkos(zcsr); 1110 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1111 PetscCall(MatHeaderReplace(Y, &Z)); 1112 kh.destroy_spadd_handle(); 1113 } 1114 PetscCall(PetscLogGpuTimeEnd()); 1115 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 1116 PetscFunctionReturn(PETSC_SUCCESS); 1117 } 1118 1119 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1120 { 1121 Mat_SeqAIJKokkos *akok; 1122 Mat_SeqAIJ *aseq; 1123 1124 PetscFunctionBegin; 1125 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1126 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1127 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1128 delete akok; 1129 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1130 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1131 akok->SetUpCOO(aseq); 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1136 { 1137 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1138 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1139 PetscCount Annz = aseq->nz; 1140 const PetscCountKokkosView &jmap = akok->jmap_d; 1141 const PetscCountKokkosView &perm = akok->perm_d; 1142 MatScalarKokkosView Aa; 1143 ConstMatScalarKokkosView kv; 1144 PetscMemType memtype; 1145 1146 PetscFunctionBegin; 1147 PetscCall(PetscGetMemType(v, &memtype)); 1148 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1149 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1150 } else { 1151 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1152 } 1153 1154 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1155 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1156 1157 Kokkos::parallel_for( 1158 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1159 PetscScalar sum = 0.0; 1160 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1161 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1162 }); 1163 1164 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1165 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1166 PetscFunctionReturn(PETSC_SUCCESS); 1167 } 1168 1169 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1170 { 1171 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1172 MatScalarKokkosView Aa; 1173 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1174 PetscInt m = A->rmap->n; 1175 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1176 1177 PetscFunctionBegin; 1178 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1179 Kokkos::parallel_for( 1180 m, KOKKOS_LAMBDA(const PetscInt i) { 1181 PetscScalar tmp; 1182 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1183 tmp = Aa(Ai(i)); 1184 Aa(Ai(i)) = Aa(Adiag(i)); 1185 Aa(Adiag(i)) = tmp; 1186 } 1187 }); 1188 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1193 { 1194 PetscFunctionBegin; 1195 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1196 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1197 B->offloadmask = PETSC_OFFLOAD_CPU; 1198 PetscFunctionReturn(PETSC_SUCCESS); 1199 } 1200 1201 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1202 { 1203 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1204 1205 PetscFunctionBegin; 1206 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1207 A->boundtocpu = PETSC_FALSE; 1208 1209 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1210 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1211 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1212 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1213 A->ops->scale = MatScale_SeqAIJKokkos; 1214 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1215 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1216 A->ops->mult = MatMult_SeqAIJKokkos; 1217 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1218 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1219 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1220 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1221 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1222 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1223 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1224 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1225 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1226 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1227 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1228 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1229 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1230 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1231 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1232 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1233 1234 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1235 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1236 PetscFunctionReturn(PETSC_SUCCESS); 1237 } 1238 1239 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1240 { 1241 Mat_SeqAIJ *aseq; 1242 PetscInt i, m, n; 1243 1244 PetscFunctionBegin; 1245 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1246 1247 m = akok->nrows(); 1248 n = akok->ncols(); 1249 PetscCall(MatSetSizes(A, m, n, m, n)); 1250 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1251 1252 /* Set up data structures of A as a MATSEQAIJ */ 1253 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1254 aseq = (Mat_SeqAIJ *)(A)->data; 1255 1256 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1257 akok->j_dual.sync_host(); 1258 1259 aseq->i = akok->i_host_data(); 1260 aseq->j = akok->j_host_data(); 1261 aseq->a = akok->a_host_data(); 1262 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1263 aseq->singlemalloc = PETSC_FALSE; 1264 aseq->free_a = PETSC_FALSE; 1265 aseq->free_ij = PETSC_FALSE; 1266 aseq->nz = akok->nnz(); 1267 aseq->maxnz = aseq->nz; 1268 1269 PetscCall(PetscMalloc1(m, &aseq->imax)); 1270 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1271 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1272 1273 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274 akok->nonzerostate = A->nonzerostate; 1275 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1276 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1277 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1278 PetscFunctionReturn(PETSC_SUCCESS); 1279 } 1280 1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282 1283 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284 */ 1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1286 { 1287 PetscFunctionBegin; 1288 PetscCall(MatCreate(comm, A)); 1289 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1290 PetscFunctionReturn(PETSC_SUCCESS); 1291 } 1292 1293 /* --------------------------------------------------------------------------------*/ 1294 /*@C 1295 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 1296 (the default parallel PETSc format). This matrix will ultimately be handled by 1297 Kokkos for calculations. For good matrix 1298 assembly performance the user should preallocate the matrix storage by setting 1299 the parameter nz (or the array nnz). By setting these parameters accurately, 1300 performance during matrix assembly can be increased by more than a factor of 50. 1301 1302 Collective 1303 1304 Input Parameters: 1305 + comm - MPI communicator, set to `PETSC_COMM_SELF` 1306 . m - number of rows 1307 . n - number of columns 1308 . nz - number of nonzeros per row (same for all rows) 1309 - nnz - array containing the number of nonzeros in the various rows 1310 (possibly different for each row) or NULL 1311 1312 Output Parameter: 1313 . A - the matrix 1314 1315 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 1316 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1317 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 1318 1319 Notes: 1320 If nnz is given then nz is ignored 1321 1322 The AIJ format, also called 1323 compressed row storage, is fully compatible with standard Fortran 77 1324 storage. That is, the stored row and column indices can begin at 1325 either one (as in Fortran) or zero. See the users' manual for details. 1326 1327 Specify the preallocated storage with either nz or nnz (not both). 1328 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 1329 allocation. For large problems you MUST preallocate memory or you 1330 will get TERRIBLE performance, see the users' manual chapter on matrices. 1331 1332 By default, this format uses inodes (identical nodes) when possible, to 1333 improve numerical efficiency of matrix-vector products and solves. We 1334 search for consecutive rows with the same nonzero structure, thereby 1335 reusing matrix information to achieve increased efficiency. 1336 1337 Level: intermediate 1338 1339 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1340 @*/ 1341 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1342 { 1343 PetscFunctionBegin; 1344 PetscCall(PetscKokkosInitializeCheck()); 1345 PetscCall(MatCreate(comm, A)); 1346 PetscCall(MatSetSizes(*A, m, n, m, n)); 1347 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1348 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1349 PetscFunctionReturn(PETSC_SUCCESS); 1350 } 1351 1352 typedef Kokkos::TeamPolicy<>::member_type team_member; 1353 // 1354 // This factorization exploits block diagonal matrices with "Nf" (not used). 1355 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1356 // 1357 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1358 { 1359 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1360 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1361 IS isrow = b->row, isicol = b->icol; 1362 const PetscInt *r_h, *ic_h; 1363 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1364 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1365 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1366 PetscBool row_identity, col_identity; 1367 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1368 1369 PetscFunctionBegin; 1370 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1371 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1372 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1373 PetscCall(ISGetIndices(isrow, &r_h)); 1374 PetscCall(ISGetIndices(isicol, &ic_h)); 1375 PetscCall(ISGetSize(isicol, &nc)); 1376 PetscCall(PetscLogGpuTimeBegin()); 1377 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1378 { 1379 #define KOKKOS_SHARED_LEVEL 1 1380 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1381 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1382 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1383 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1384 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1385 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1386 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1387 size_t flops_h = 0.0; 1388 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1389 Kokkos::View<size_t> d_flops_k("flops"); 1390 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1391 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 1392 Kokkos::deep_copy(d_flops_k, h_flops_k); 1393 Kokkos::deep_copy(d_r_k, h_r_k); 1394 Kokkos::deep_copy(d_ic_k, h_ic_k); 1395 // Fill A --> fact 1396 Kokkos::parallel_for( 1397 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1398 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1399 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1400 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1401 // zero rows of B 1402 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1403 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1404 PetscScalar *baL = ba_d + bi_d[rowb]; 1405 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1406 /* zero (unfactored row) */ 1407 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1408 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1409 }); 1410 // copy A into B 1411 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1412 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1413 const PetscScalar *av = aa_d + ai_d[rowa]; 1414 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1415 /* load in initial (unfactored row) */ 1416 for (int j = 0; j < nza; j++) { 1417 PetscInt colb = ic[ajtmp[j]]; 1418 PetscScalar vala = av[j]; 1419 if (colb == rowb) { 1420 *(ba_d + bdiag_d[rowb]) = vala; 1421 } else { 1422 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1423 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1424 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1425 for (int j = 0; j < nz; j++) { 1426 if (pbj[j] == colb) { 1427 pba[j] = vala; 1428 set++; 1429 break; 1430 } 1431 } 1432 #if !defined(PETSC_HAVE_SYCL) 1433 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1434 #endif 1435 } 1436 } 1437 }); 1438 }); 1439 Kokkos::fence(); 1440 1441 Kokkos::parallel_for( 1442 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1443 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1444 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1445 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1446 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1447 const PetscInt start = field * nloc, end = start + nloc; 1448 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1449 // A22 panel update for each row A(1,:) and col A(:,1) 1450 for (int ii = start; ii < end - 1; ii++) { 1451 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1452 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1453 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1454 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1455 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1456 PetscInt kIdx = j * Ni + field_block_idx; 1457 if (kIdx >= nzUi) /* void */ 1458 ; 1459 else { 1460 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1461 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1462 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1463 size_t st_idx; 1464 // find and do L(k,i) = A(:k,i) / A(i,i) 1465 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1466 // get column, there has got to be a better way 1467 Kokkos::parallel_reduce( 1468 Kokkos::ThreadVectorRange(team, nzL), 1469 [&](const int &j, size_t &idx) { 1470 if (pjL[j] == ii) { 1471 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1472 idx = j; // output 1473 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1474 } 1475 }, 1476 st_idx); 1477 Kokkos::single(Kokkos::PerThread(team), [=]() { 1478 colkIdx() = st_idx; 1479 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1480 }); 1481 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1482 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1483 #endif 1484 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1485 // U(i+1,:end) 1486 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1487 PetscScalar Uij = baUi[uiIdx]; 1488 PetscInt col = bjUi[uiIdx]; 1489 if (col == myk) { 1490 // A_kk = A_kk - L_ki * U_ij(k) 1491 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1492 *Akkv = *Akkv - L_ki() * Uij; // UiK 1493 } else { 1494 PetscScalar *start, *end, *pAkjv = NULL; 1495 PetscInt high, low; 1496 const PetscInt *startj; 1497 if (col < myk) { // L 1498 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1499 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1500 start = pLki + 1; // start at pLki+1, A22(myk,1) 1501 startj = bj_d + bi_d[myk] + idx; 1502 end = ba_d + bi_d[myk + 1]; 1503 } else { 1504 PetscInt idx = bdiag_d[myk + 1] + 1; 1505 start = ba_d + idx; 1506 startj = bj_d + idx; 1507 end = ba_d + bdiag_d[myk]; 1508 } 1509 // search for 'col', use bisection search - TODO 1510 low = 0; 1511 high = (PetscInt)(end - start); 1512 while (high - low > 5) { 1513 int t = (low + high) / 2; 1514 if (startj[t] > col) high = t; 1515 else low = t; 1516 } 1517 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1518 if (startj[pAkjv - start] == col) break; 1519 } 1520 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1521 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1522 #endif 1523 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1524 } 1525 }); 1526 } 1527 }); 1528 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1529 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1530 } /* endof for (i=0; i<n; i++) { */ 1531 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1532 Kokkos::atomic_add(&d_flops_k(), flops()); 1533 flops() = 0; 1534 }); 1535 }); 1536 Kokkos::fence(); 1537 Kokkos::deep_copy(h_flops_k, d_flops_k); 1538 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1539 Kokkos::parallel_for( 1540 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1541 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1542 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1543 /* Invert diagonal for simpler triangular solves */ 1544 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1545 int i = start + outer_index * Ni + lg_rank % Ni; 1546 if (i < end) { 1547 PetscScalar *pv = ba_d + bdiag_d[i]; 1548 *pv = 1.0 / (*pv); 1549 } 1550 }); 1551 }); 1552 } 1553 PetscCall(PetscLogGpuTimeEnd()); 1554 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1555 PetscCall(ISRestoreIndices(isrow, &r_h)); 1556 1557 PetscCall(ISIdentity(isrow, &row_identity)); 1558 PetscCall(ISIdentity(isicol, &col_identity)); 1559 if (b->inode.size) { 1560 B->ops->solve = MatSolve_SeqAIJ_Inode; 1561 } else if (row_identity && col_identity) { 1562 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1563 } else { 1564 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1565 } 1566 B->offloadmask = PETSC_OFFLOAD_GPU; 1567 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1568 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1569 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1570 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1571 B->ops->matsolve = MatMatSolve_SeqAIJ; 1572 B->assembled = PETSC_TRUE; 1573 B->preallocated = PETSC_TRUE; 1574 1575 PetscFunctionReturn(PETSC_SUCCESS); 1576 } 1577 1578 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1579 { 1580 PetscFunctionBegin; 1581 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1582 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1583 PetscFunctionReturn(PETSC_SUCCESS); 1584 } 1585 1586 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1587 { 1588 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1589 1590 PetscFunctionBegin; 1591 if (!factors->sptrsv_symbolic_completed) { 1592 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1593 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1594 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1595 } 1596 PetscFunctionReturn(PETSC_SUCCESS); 1597 } 1598 1599 /* Check if we need to update factors etc for transpose solve */ 1600 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1601 { 1602 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1603 MatColIdxType n = A->rmap->n; 1604 1605 PetscFunctionBegin; 1606 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1607 /* Update L^T and do sptrsv symbolic */ 1608 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1609 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1610 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1611 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1612 1613 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1614 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1615 1616 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1617 We have to sort the indices, until KK provides finer control options. 1618 */ 1619 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1620 1621 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1622 1623 /* Update U^T and do sptrsv symbolic */ 1624 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1625 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1626 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1627 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1628 1629 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1630 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1631 1632 /* Sort indices. See comments above */ 1633 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1634 1635 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1636 factors->transpose_updated = PETSC_TRUE; 1637 } 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 /* Solve Ax = b, with A = LU */ 1642 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1643 { 1644 ConstPetscScalarKokkosView bv; 1645 PetscScalarKokkosView xv; 1646 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1647 1648 PetscFunctionBegin; 1649 PetscCall(PetscLogGpuTimeBegin()); 1650 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1651 PetscCall(VecGetKokkosView(b, &bv)); 1652 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1653 /* Solve L tmpv = b */ 1654 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1655 /* Solve Ux = tmpv */ 1656 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1657 PetscCall(VecRestoreKokkosView(b, &bv)); 1658 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1659 PetscCall(PetscLogGpuTimeEnd()); 1660 PetscFunctionReturn(PETSC_SUCCESS); 1661 } 1662 1663 /* Solve A^T x = b, where A^T = U^T L^T */ 1664 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1665 { 1666 ConstPetscScalarKokkosView bv; 1667 PetscScalarKokkosView xv; 1668 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1669 1670 PetscFunctionBegin; 1671 PetscCall(PetscLogGpuTimeBegin()); 1672 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1673 PetscCall(VecGetKokkosView(b, &bv)); 1674 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1675 /* Solve U^T tmpv = b */ 1676 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1677 1678 /* Solve L^T x = tmpv */ 1679 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1680 PetscCall(VecRestoreKokkosView(b, &bv)); 1681 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1682 PetscCall(PetscLogGpuTimeEnd()); 1683 PetscFunctionReturn(PETSC_SUCCESS); 1684 } 1685 1686 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1687 { 1688 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1689 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1690 PetscInt fill_lev = info->levels; 1691 1692 PetscFunctionBegin; 1693 PetscCall(PetscLogGpuTimeBegin()); 1694 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1695 1696 auto a_d = aijkok->a_dual.view_device(); 1697 auto i_d = aijkok->i_dual.view_device(); 1698 auto j_d = aijkok->j_dual.view_device(); 1699 1700 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1701 1702 B->assembled = PETSC_TRUE; 1703 B->preallocated = PETSC_TRUE; 1704 B->ops->solve = MatSolve_SeqAIJKokkos; 1705 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1706 B->ops->matsolve = NULL; 1707 B->ops->matsolvetranspose = NULL; 1708 B->offloadmask = PETSC_OFFLOAD_GPU; 1709 1710 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1711 factors->transpose_updated = PETSC_FALSE; 1712 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1713 /* TODO: log flops, but how to know that? */ 1714 PetscCall(PetscLogGpuTimeEnd()); 1715 PetscFunctionReturn(PETSC_SUCCESS); 1716 } 1717 1718 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1719 { 1720 Mat_SeqAIJKokkos *aijkok; 1721 Mat_SeqAIJ *b; 1722 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1723 PetscInt fill_lev = info->levels; 1724 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1725 PetscInt n = A->rmap->n; 1726 1727 PetscFunctionBegin; 1728 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1729 /* Rebuild factors */ 1730 if (factors) { 1731 factors->Destroy(); 1732 } /* Destroy the old if it exists */ 1733 else { 1734 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1735 } 1736 1737 /* Create a spiluk handle and then do symbolic factorization */ 1738 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1739 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1740 1741 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1742 1743 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1744 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1745 Kokkos::realloc(factors->iU_d, n + 1); 1746 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1747 1748 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1749 auto i_d = aijkok->i_dual.view_device(); 1750 auto j_d = aijkok->j_dual.view_device(); 1751 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1752 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1753 1754 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1755 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1756 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1757 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1758 1759 /* TODO: add options to select sptrsv algorithms */ 1760 /* Create sptrsv handles for L, U and their transpose */ 1761 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1762 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1763 #else 1764 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1765 #endif 1766 1767 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1768 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1769 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1770 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1771 1772 /* Fill fields of the factor matrix B */ 1773 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1774 b = (Mat_SeqAIJ *)B->data; 1775 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1776 B->info.fill_ratio_given = info->fill; 1777 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1778 1779 B->offloadmask = PETSC_OFFLOAD_GPU; 1780 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1781 PetscFunctionReturn(PETSC_SUCCESS); 1782 } 1783 1784 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1785 { 1786 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1787 const PetscInt nrows = A->rmap->n; 1788 1789 PetscFunctionBegin; 1790 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1791 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1792 // move B data into Kokkos 1793 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1794 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1795 { 1796 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1797 if (!baijkok->diag_d.extent(0)) { 1798 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1799 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1800 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1801 } 1802 } 1803 PetscFunctionReturn(PETSC_SUCCESS); 1804 } 1805 1806 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1807 { 1808 PetscFunctionBegin; 1809 *type = MATSOLVERKOKKOS; 1810 PetscFunctionReturn(PETSC_SUCCESS); 1811 } 1812 1813 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1814 { 1815 PetscFunctionBegin; 1816 *type = MATSOLVERKOKKOSDEVICE; 1817 PetscFunctionReturn(PETSC_SUCCESS); 1818 } 1819 1820 /*MC 1821 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1822 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1823 1824 Level: beginner 1825 1826 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1827 M*/ 1828 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1829 { 1830 PetscInt n = A->rmap->n; 1831 1832 PetscFunctionBegin; 1833 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1834 PetscCall(MatSetSizes(*B, n, n, n, n)); 1835 (*B)->factortype = ftype; 1836 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1837 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1838 1839 if (ftype == MAT_FACTOR_LU) { 1840 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1841 (*B)->canuseordering = PETSC_TRUE; 1842 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1843 } else if (ftype == MAT_FACTOR_ILU) { 1844 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1845 (*B)->canuseordering = PETSC_FALSE; 1846 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1847 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1848 1849 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1850 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1851 PetscFunctionReturn(PETSC_SUCCESS); 1852 } 1853 1854 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1855 { 1856 PetscInt n = A->rmap->n; 1857 1858 PetscFunctionBegin; 1859 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1860 PetscCall(MatSetSizes(*B, n, n, n, n)); 1861 (*B)->factortype = ftype; 1862 (*B)->canuseordering = PETSC_TRUE; 1863 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1864 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1865 1866 if (ftype == MAT_FACTOR_LU) { 1867 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1868 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1869 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1870 1871 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1872 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1873 PetscFunctionReturn(PETSC_SUCCESS); 1874 } 1875 1876 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1877 { 1878 PetscFunctionBegin; 1879 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1880 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1881 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1882 PetscFunctionReturn(PETSC_SUCCESS); 1883 } 1884 1885 /* Utility to print out a KokkosCsrMatrix for debugging */ 1886 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1887 { 1888 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1889 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1890 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1891 const PetscInt *i = iv.data(); 1892 const PetscInt *j = jv.data(); 1893 const PetscScalar *a = av.data(); 1894 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1895 1896 PetscFunctionBegin; 1897 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1898 for (PetscInt k = 0; k < m; k++) { 1899 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1900 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1901 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1902 } 1903 PetscFunctionReturn(PETSC_SUCCESS); 1904 } 1905