1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::sort_crs_matrix; 22 using KokkosSparse::Impl::transpose_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::sort_crs_matrix; 26 using KokkosKernels::Impl::transpose_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) { 36 Mat_SeqAIJ *aijseq; 37 Mat_SeqAIJKokkos *aijkok; 38 39 PetscFunctionBegin; 40 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 41 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 42 43 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 44 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 45 46 /* If aijkok does not exist, we just copy i, j to device. 47 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 48 In both cases, we build a new aijkok structure. 49 */ 50 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 51 delete aijkok; 52 aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 53 A->spptr = aijkok; 54 } 55 56 if (aijkok->device_mat_d.data()) { 57 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 58 } 59 PetscFunctionReturn(0); 60 } 61 62 /* Sync CSR data to device if not yet */ 63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) { 64 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 65 66 PetscFunctionBegin; 67 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 68 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 69 if (aijkok->a_dual.need_sync_device()) { 70 aijkok->a_dual.sync_device(); 71 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 72 aijkok->hermitian_updated = PETSC_FALSE; 73 } 74 PetscFunctionReturn(0); 75 } 76 77 /* Mark the CSR data on device as modified */ 78 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) { 79 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 80 81 PetscFunctionBegin; 82 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 83 aijkok->a_dual.clear_sync_state(); 84 aijkok->a_dual.modify_device(); 85 aijkok->transpose_updated = PETSC_FALSE; 86 aijkok->hermitian_updated = PETSC_FALSE; 87 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 88 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 89 PetscFunctionReturn(0); 90 } 91 92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) { 93 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 94 95 PetscFunctionBegin; 96 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 97 /* We do not expect one needs factors on host */ 98 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 99 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 100 aijkok->a_dual.sync_host(); 101 PetscFunctionReturn(0); 102 } 103 104 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 105 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 106 107 PetscFunctionBegin; 108 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 109 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 110 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 111 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 112 */ 113 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 114 aijkok->a_dual.sync_host(); 115 *array = aijkok->a_dual.view_host().data(); 116 } else { /* Happens when calling MatSetValues on a newly created matrix */ 117 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 118 } 119 PetscFunctionReturn(0); 120 } 121 122 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 123 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 124 125 PetscFunctionBegin; 126 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 127 PetscFunctionReturn(0); 128 } 129 130 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) { 131 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 132 133 PetscFunctionBegin; 134 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 135 aijkok->a_dual.sync_host(); 136 *array = aijkok->a_dual.view_host().data(); 137 } else { 138 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 139 } 140 PetscFunctionReturn(0); 141 } 142 143 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) { 144 PetscFunctionBegin; 145 *array = NULL; 146 PetscFunctionReturn(0); 147 } 148 149 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 162 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 163 164 PetscFunctionBegin; 165 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 166 aijkok->a_dual.clear_sync_state(); 167 aijkok->a_dual.modify_host(); 168 } 169 PetscFunctionReturn(0); 170 } 171 172 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) { 173 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 174 175 PetscFunctionBegin; 176 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 177 178 if (i) *i = aijkok->i_device_data(); 179 if (j) *j = aijkok->j_device_data(); 180 if (a) { 181 aijkok->a_dual.sync_device(); 182 *a = aijkok->a_device_data(); 183 } 184 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 185 PetscFunctionReturn(0); 186 } 187 188 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 189 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) { 190 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 191 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 192 193 PetscFunctionBegin; 194 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 195 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 196 Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 197 PetscFunctionReturn(0); 198 } 199 200 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 201 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203 204 PetscFunctionBegin; 205 if (aijkok && aijkok->device_mat_d.data()) { 206 *d_mat = aijkok->device_mat_d.data(); 207 } else { 208 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 209 *d_mat = NULL; 210 } 211 PetscFunctionReturn(0); 212 } 213 214 /* Generate the transpose on device and cache it internally */ 215 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) { 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 217 218 PetscFunctionBegin; 219 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 222 PetscCallCXX(aijkok->a_dual.sync_device()); 223 PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 224 PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 225 aijkok->transpose_updated = PETSC_TRUE; 226 } 227 *csrmatT = &aijkok->csrmatT; 228 PetscFunctionReturn(0); 229 } 230 231 /* Generate the Hermitian on device and cache it internally */ 232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) { 233 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 234 235 PetscFunctionBegin; 236 PetscCall(PetscLogGpuTimeBegin()); 237 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 238 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 239 PetscCallCXX(aijkok->a_dual.sync_device()); 240 PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 241 PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 242 #if defined(PETSC_USE_COMPLEX) 243 const auto &a = aijkok->csrmatH.values; 244 Kokkos::parallel_for( 245 a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 246 #endif 247 aijkok->hermitian_updated = PETSC_TRUE; 248 } 249 *csrmatH = &aijkok->csrmatH; 250 PetscCall(PetscLogGpuTimeEnd()); 251 PetscFunctionReturn(0); 252 } 253 254 /* y = A x */ 255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 256 Mat_SeqAIJKokkos *aijkok; 257 ConstPetscScalarKokkosView xv; 258 PetscScalarKokkosView yv; 259 260 PetscFunctionBegin; 261 PetscCall(PetscLogGpuTimeBegin()); 262 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 263 PetscCall(VecGetKokkosView(xx, &xv)); 264 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 265 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 266 KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */ 267 PetscCall(VecRestoreKokkosView(xx, &xv)); 268 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 269 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 270 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 271 PetscCall(PetscLogGpuTimeEnd()); 272 PetscFunctionReturn(0); 273 } 274 275 /* y = A^T x */ 276 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 277 Mat_SeqAIJKokkos *aijkok; 278 const char *mode; 279 ConstPetscScalarKokkosView xv; 280 PetscScalarKokkosView yv; 281 KokkosCsrMatrix *csrmat; 282 283 PetscFunctionBegin; 284 PetscCall(PetscLogGpuTimeBegin()); 285 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 286 PetscCall(VecGetKokkosView(xx, &xv)); 287 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 288 if (A->form_explicit_transpose) { 289 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 290 mode = "N"; 291 } else { 292 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 293 csrmat = &aijkok->csrmat; 294 mode = "T"; 295 } 296 KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */ 297 PetscCall(VecRestoreKokkosView(xx, &xv)); 298 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 299 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 300 PetscCall(PetscLogGpuTimeEnd()); 301 PetscFunctionReturn(0); 302 } 303 304 /* y = A^H x */ 305 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 306 Mat_SeqAIJKokkos *aijkok; 307 const char *mode; 308 ConstPetscScalarKokkosView xv; 309 PetscScalarKokkosView yv; 310 KokkosCsrMatrix *csrmat; 311 312 PetscFunctionBegin; 313 PetscCall(PetscLogGpuTimeBegin()); 314 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 315 PetscCall(VecGetKokkosView(xx, &xv)); 316 PetscCall(VecGetKokkosViewWrite(yy, &yv)); 317 if (A->form_explicit_transpose) { 318 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 319 mode = "N"; 320 } else { 321 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 322 csrmat = &aijkok->csrmat; 323 mode = "C"; 324 } 325 KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */ 326 PetscCall(VecRestoreKokkosView(xx, &xv)); 327 PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 328 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 329 PetscCall(PetscLogGpuTimeEnd()); 330 PetscFunctionReturn(0); 331 } 332 333 /* z = A x + y */ 334 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 335 Mat_SeqAIJKokkos *aijkok; 336 ConstPetscScalarKokkosView xv, yv; 337 PetscScalarKokkosView zv; 338 339 PetscFunctionBegin; 340 PetscCall(PetscLogGpuTimeBegin()); 341 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 342 PetscCall(VecGetKokkosView(xx, &xv)); 343 PetscCall(VecGetKokkosView(yy, &yv)); 344 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 345 if (zz != yy) Kokkos::deep_copy(zv, yv); 346 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 347 KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */ 348 PetscCall(VecRestoreKokkosView(xx, &xv)); 349 PetscCall(VecRestoreKokkosView(yy, &yv)); 350 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 351 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 352 PetscCall(PetscLogGpuTimeEnd()); 353 PetscFunctionReturn(0); 354 } 355 356 /* z = A^T x + y */ 357 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 358 Mat_SeqAIJKokkos *aijkok; 359 const char *mode; 360 ConstPetscScalarKokkosView xv, yv; 361 PetscScalarKokkosView zv; 362 KokkosCsrMatrix *csrmat; 363 364 PetscFunctionBegin; 365 PetscCall(PetscLogGpuTimeBegin()); 366 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 367 PetscCall(VecGetKokkosView(xx, &xv)); 368 PetscCall(VecGetKokkosView(yy, &yv)); 369 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 370 if (zz != yy) Kokkos::deep_copy(zv, yv); 371 if (A->form_explicit_transpose) { 372 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 373 mode = "N"; 374 } else { 375 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 376 csrmat = &aijkok->csrmat; 377 mode = "T"; 378 } 379 KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */ 380 PetscCall(VecRestoreKokkosView(xx, &xv)); 381 PetscCall(VecRestoreKokkosView(yy, &yv)); 382 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 383 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 384 PetscCall(PetscLogGpuTimeEnd()); 385 PetscFunctionReturn(0); 386 } 387 388 /* z = A^H x + y */ 389 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 390 Mat_SeqAIJKokkos *aijkok; 391 const char *mode; 392 ConstPetscScalarKokkosView xv, yv; 393 PetscScalarKokkosView zv; 394 KokkosCsrMatrix *csrmat; 395 396 PetscFunctionBegin; 397 PetscCall(PetscLogGpuTimeBegin()); 398 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 399 PetscCall(VecGetKokkosView(xx, &xv)); 400 PetscCall(VecGetKokkosView(yy, &yv)); 401 PetscCall(VecGetKokkosViewWrite(zz, &zv)); 402 if (zz != yy) Kokkos::deep_copy(zv, yv); 403 if (A->form_explicit_transpose) { 404 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 405 mode = "N"; 406 } else { 407 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 408 csrmat = &aijkok->csrmat; 409 mode = "C"; 410 } 411 KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */ 412 PetscCall(VecRestoreKokkosView(xx, &xv)); 413 PetscCall(VecRestoreKokkosView(yy, &yv)); 414 PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 415 PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 416 PetscCall(PetscLogGpuTimeEnd()); 417 PetscFunctionReturn(0); 418 } 419 420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) { 421 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 422 423 PetscFunctionBegin; 424 switch (op) { 425 case MAT_FORM_EXPLICIT_TRANSPOSE: 426 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 427 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 428 A->form_explicit_transpose = flg; 429 break; 430 default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break; 431 } 432 PetscFunctionReturn(0); 433 } 434 435 /* Depending on reuse, either build a new mat, or use the existing mat */ 436 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) { 437 Mat_SeqAIJ *aseq; 438 439 PetscFunctionBegin; 440 PetscCall(PetscKokkosInitializeCheck()); 441 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 442 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 443 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 444 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 445 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 446 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 447 PetscCall(PetscFree(A->defaultvectype)); 448 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 449 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 450 PetscCall(MatSetOps_SeqAIJKokkos(A)); 451 aseq = static_cast<Mat_SeqAIJ *>(A->data); 452 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 453 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 454 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 455 } 456 } 457 PetscFunctionReturn(0); 458 } 459 460 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 461 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 462 */ 463 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) { 464 Mat_SeqAIJ *bseq; 465 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 466 Mat mat; 467 468 PetscFunctionBegin; 469 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 470 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 471 mat = *B; 472 if (A->assembled) { 473 bseq = static_cast<Mat_SeqAIJ *>(mat->data); 474 bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 475 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 476 /* Now copy values to B if needed */ 477 if (dupOption == MAT_COPY_VALUES) { 478 if (akok->a_dual.need_sync_device()) { 479 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 480 bkok->a_dual.modify_host(); 481 } else { /* If device has the latest data, we only copy data on device */ 482 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 483 bkok->a_dual.modify_device(); 484 } 485 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 486 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 487 bkok->a_dual.modify_host(); 488 } 489 mat->spptr = bkok; 490 } 491 492 PetscCall(PetscFree(mat->defaultvectype)); 493 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 494 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 495 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 496 PetscFunctionReturn(0); 497 } 498 499 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) { 500 Mat At; 501 KokkosCsrMatrix *internT; 502 Mat_SeqAIJKokkos *atkok, *bkok; 503 504 PetscFunctionBegin; 505 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 506 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 507 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 508 /* Deep copy internT, as we want to isolate the internal transpose */ 509 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 510 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 511 if (reuse == MAT_INITIAL_MATRIX) *B = At; 512 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 513 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 514 if ((*B)->assembled) { 515 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 516 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 517 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 518 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 519 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 520 MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 521 MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 522 PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 523 PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 524 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 525 } 526 PetscFunctionReturn(0); 527 } 528 529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) { 530 Mat_SeqAIJKokkos *aijkok; 531 532 PetscFunctionBegin; 533 if (A->factortype == MAT_FACTOR_NONE) { 534 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 535 delete aijkok; 536 } else { 537 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 538 } 539 A->spptr = NULL; 540 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 541 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 542 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 543 PetscCall(MatDestroy_SeqAIJ(A)); 544 PetscFunctionReturn(0); 545 } 546 547 /*MC 548 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 549 550 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 551 552 Options Database Keys: 553 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 554 555 Level: beginner 556 557 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 558 M*/ 559 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) { 560 PetscFunctionBegin; 561 PetscCall(PetscKokkosInitializeCheck()); 562 PetscCall(MatCreate_SeqAIJ(A)); 563 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 564 PetscFunctionReturn(0); 565 } 566 567 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 568 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) { 569 Mat_SeqAIJ *a, *b; 570 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 571 MatScalarKokkosView aa, ba, ca; 572 MatRowMapKokkosView ai, bi, ci; 573 MatColIdxKokkosView aj, bj, cj; 574 PetscInt m, n, nnz, aN; 575 576 PetscFunctionBegin; 577 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 578 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 579 PetscValidPointer(C, 4); 580 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 581 PetscCheckTypeName(B, MATSEQAIJKOKKOS); 582 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 583 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 584 585 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 586 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 587 a = static_cast<Mat_SeqAIJ *>(A->data); 588 b = static_cast<Mat_SeqAIJ *>(B->data); 589 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 590 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 591 aa = akok->a_dual.view_device(); 592 ai = akok->i_dual.view_device(); 593 ba = bkok->a_dual.view_device(); 594 bi = bkok->i_dual.view_device(); 595 m = A->rmap->n; /* M, N and nnz of C */ 596 n = A->cmap->n + B->cmap->n; 597 nnz = a->nz + b->nz; 598 aN = A->cmap->n; /* N of A */ 599 if (reuse == MAT_INITIAL_MATRIX) { 600 aj = akok->j_dual.view_device(); 601 bj = bkok->j_dual.view_device(); 602 auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 603 auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 604 auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 605 ca = ca_dual.view_device(); 606 ci = ci_dual.view_device(); 607 cj = cj_dual.view_device(); 608 609 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 610 Kokkos::parallel_for( 611 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 612 PetscInt i = t.league_rank(); /* row i */ 613 PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 614 615 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 616 ci(i) = coffset; 617 if (i == m - 1) ci(m) = ai(m) + bi(m); 618 }); 619 620 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 621 if (k < alen) { 622 ca(coffset + k) = aa(ai(i) + k); 623 cj(coffset + k) = aj(ai(i) + k); 624 } else { 625 ca(coffset + k) = ba(bi(i) + k - alen); 626 cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 627 } 628 }); 629 }); 630 ca_dual.modify_device(); 631 ci_dual.modify_device(); 632 cj_dual.modify_device(); 633 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 634 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 635 } else if (reuse == MAT_REUSE_MATRIX) { 636 PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 637 PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 638 ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 639 ca = ckok->a_dual.view_device(); 640 ci = ckok->i_dual.view_device(); 641 642 Kokkos::parallel_for( 643 Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 644 PetscInt i = t.league_rank(); /* row i */ 645 PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 646 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 647 if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 648 else ca(ci(i) + k) = ba(bi(i) + k - alen); 649 }); 650 }); 651 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 652 } 653 PetscFunctionReturn(0); 654 } 655 656 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) { 657 PetscFunctionBegin; 658 delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 659 PetscFunctionReturn(0); 660 } 661 662 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) { 663 Mat_Product *product = C->product; 664 Mat A, B; 665 bool transA, transB; /* use bool, since KK needs this type */ 666 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667 Mat_SeqAIJ *c; 668 MatProductData_SeqAIJKokkos *pdata; 669 KokkosCsrMatrix *csrmatA, *csrmatB; 670 671 PetscFunctionBegin; 672 MatCheckProduct(C, 1); 673 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 674 pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 675 676 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 677 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 678 PetscFunctionReturn(0); 679 } 680 681 switch (product->type) { 682 case MATPRODUCT_AB: 683 transA = false; 684 transB = false; 685 break; 686 case MATPRODUCT_AtB: 687 transA = true; 688 transB = false; 689 break; 690 case MATPRODUCT_ABt: 691 transA = false; 692 transB = true; 693 break; 694 default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 695 } 696 697 A = product->A; 698 B = product->B; 699 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 700 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 701 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 702 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 703 ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 704 705 PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 706 707 csrmatA = &akok->csrmat; 708 csrmatB = &bkok->csrmat; 709 710 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 711 if (transA) { 712 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 713 transA = false; 714 } 715 716 if (transB) { 717 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 718 transB = false; 719 } 720 PetscCall(PetscLogGpuTimeBegin()); 721 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 722 PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 723 PetscCall(PetscLogGpuTimeEnd()); 724 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 725 /* shorter version of MatAssemblyEnd_SeqAIJ */ 726 c = (Mat_SeqAIJ *)C->data; 727 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 728 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 729 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 730 c->reallocs = 0; 731 C->info.mallocs = 0; 732 C->info.nz_unneeded = 0; 733 C->assembled = C->was_assembled = PETSC_TRUE; 734 C->num_ass++; 735 PetscFunctionReturn(0); 736 } 737 738 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) { 739 Mat_Product *product = C->product; 740 MatProductType ptype; 741 Mat A, B; 742 bool transA, transB; 743 Mat_SeqAIJKokkos *akok, *bkok, *ckok; 744 MatProductData_SeqAIJKokkos *pdata; 745 MPI_Comm comm; 746 KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 747 748 PetscFunctionBegin; 749 MatCheckProduct(C, 1); 750 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 751 PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 752 A = product->A; 753 B = product->B; 754 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 755 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 756 akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 757 bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 758 csrmatA = &akok->csrmat; 759 csrmatB = &bkok->csrmat; 760 761 ptype = product->type; 762 switch (ptype) { 763 case MATPRODUCT_AB: 764 transA = false; 765 transB = false; 766 break; 767 case MATPRODUCT_AtB: 768 transA = true; 769 transB = false; 770 break; 771 case MATPRODUCT_ABt: 772 transA = false; 773 transB = true; 774 break; 775 default: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 776 } 777 778 product->data = pdata = new MatProductData_SeqAIJKokkos(); 779 pdata->kh.set_team_work_size(16); 780 pdata->kh.set_dynamic_scheduling(true); 781 pdata->reusesym = product->api_user; 782 783 /* TODO: add command line options to select spgemm algorithms */ 784 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 785 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 786 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 787 */ 788 pdata->kh.create_spgemm_handle(spgemm_alg); 789 790 PetscCall(PetscLogGpuTimeBegin()); 791 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 792 if (transA) { 793 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 794 transA = false; 795 } 796 797 if (transB) { 798 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 799 transB = false; 800 } 801 802 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 803 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 804 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 805 calling new Mat_SeqAIJKokkos(). 806 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 807 */ 808 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 809 PetscCallCXX(sort_crs_matrix(csrmatC)); 810 PetscCall(PetscLogGpuTimeEnd()); 811 812 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 813 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 814 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 815 PetscFunctionReturn(0); 816 } 817 818 /* handles sparse matrix matrix ops */ 819 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) { 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat, 1); 825 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 826 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 827 if (Biskok && Ciskok) { 828 switch (product->type) { 829 case MATPRODUCT_AB: 830 case MATPRODUCT_AtB: 831 case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; break; 832 case MATPRODUCT_PtAP: 833 case MATPRODUCT_RARt: 834 case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break; 835 default: break; 836 } 837 } else { /* fallback for AIJ */ 838 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 839 } 840 PetscFunctionReturn(0); 841 } 842 843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) { 844 Mat_SeqAIJKokkos *aijkok; 845 846 PetscFunctionBegin; 847 PetscCall(PetscLogGpuTimeBegin()); 848 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 849 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 850 KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 851 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 852 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 853 PetscCall(PetscLogGpuTimeEnd()); 854 PetscFunctionReturn(0); 855 } 856 857 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) { 858 Mat_SeqAIJKokkos *aijkok; 859 860 PetscFunctionBegin; 861 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862 if (aijkok) { /* Only zero the device if data is already there */ 863 KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 864 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 865 } else { /* Might be preallocated but not assembled */ 866 PetscCall(MatZeroEntries_SeqAIJ(A)); 867 } 868 PetscFunctionReturn(0); 869 } 870 871 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) { 872 Mat_SeqAIJ *aijseq; 873 Mat_SeqAIJKokkos *aijkok; 874 PetscInt n; 875 PetscScalarKokkosView xv; 876 877 PetscFunctionBegin; 878 PetscCall(VecGetLocalSize(x, &n)); 879 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 880 PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 881 882 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 883 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 884 885 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 886 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 887 aijseq = static_cast<Mat_SeqAIJ *>(A->data); 888 aijkok->SetDiagonal(aijseq->diag); 889 } 890 891 const auto &Aa = aijkok->a_dual.view_device(); 892 const auto &Ai = aijkok->i_dual.view_device(); 893 const auto &Adiag = aijkok->diag_dual.view_device(); 894 895 PetscCall(VecGetKokkosViewWrite(x, &xv)); 896 Kokkos::parallel_for( 897 n, KOKKOS_LAMBDA(const PetscInt i) { 898 if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 899 else xv(i) = 0; 900 }); 901 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 902 PetscFunctionReturn(0); 903 } 904 905 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 906 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) { 907 Mat_SeqAIJKokkos *aijkok; 908 909 PetscFunctionBegin; 910 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 911 PetscValidPointer(kv, 2); 912 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 913 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 914 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 915 *kv = aijkok->a_dual.view_device(); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) { 920 PetscFunctionBegin; 921 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 922 PetscValidPointer(kv, 2); 923 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 924 PetscFunctionReturn(0); 925 } 926 927 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) { 928 Mat_SeqAIJKokkos *aijkok; 929 930 PetscFunctionBegin; 931 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 932 PetscValidPointer(kv, 2); 933 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 934 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 935 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 936 *kv = aijkok->a_dual.view_device(); 937 PetscFunctionReturn(0); 938 } 939 940 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) { 941 PetscFunctionBegin; 942 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 943 PetscValidPointer(kv, 2); 944 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 945 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 946 PetscFunctionReturn(0); 947 } 948 949 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) { 950 Mat_SeqAIJKokkos *aijkok; 951 952 PetscFunctionBegin; 953 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 954 PetscValidPointer(kv, 2); 955 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 956 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 957 *kv = aijkok->a_dual.view_device(); 958 PetscFunctionReturn(0); 959 } 960 961 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) { 962 PetscFunctionBegin; 963 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 964 PetscValidPointer(kv, 2); 965 PetscCheckTypeName(A, MATSEQAIJKOKKOS); 966 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 967 PetscFunctionReturn(0); 968 } 969 970 /* Computes Y += alpha X */ 971 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) { 972 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 973 Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 974 ConstMatScalarKokkosView Xa; 975 MatScalarKokkosView Ya; 976 977 PetscFunctionBegin; 978 PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 979 PetscCheckTypeName(X, MATSEQAIJKOKKOS); 980 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 981 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 982 PetscCall(PetscLogGpuTimeBegin()); 983 984 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 985 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 986 PetscBool e; 987 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 988 if (e) { 989 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 990 if (e) pattern = SAME_NONZERO_PATTERN; 991 } 992 } 993 994 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 995 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 996 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 997 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 998 */ 999 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1000 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1001 Xa = xkok->a_dual.view_device(); 1002 Ya = ykok->a_dual.view_device(); 1003 1004 if (pattern == SAME_NONZERO_PATTERN) { 1005 KokkosBlas::axpy(alpha, Xa, Ya); 1006 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1007 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1008 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1009 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1010 1011 Kokkos::parallel_for( 1012 Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1013 PetscInt i = t.league_rank(); /* row i */ 1014 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1015 PetscInt p, q = Yi(i); 1016 for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1017 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1018 if (Xj(p) == Yj(q)) { /* Found it */ 1019 Ya(q) += alpha * Xa(p); 1020 q++; 1021 } else { 1022 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1023 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1024 */ 1025 if (Yi(i) != Yi(i + 1)) 1026 Ya(Yi(i)) = 1027 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 1028 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1029 #else 1030 Kokkos::Experimental::nan("1"); 1031 #endif 1032 } 1033 } 1034 }); 1035 }); 1036 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1037 } else { /* different nonzero patterns */ 1038 Mat Z; 1039 KokkosCsrMatrix zcsr; 1040 KernelHandle kh; 1041 kh.create_spadd_handle(false); 1042 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1043 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1044 zkok = new Mat_SeqAIJKokkos(zcsr); 1045 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 1046 PetscCall(MatHeaderReplace(Y, &Z)); 1047 kh.destroy_spadd_handle(); 1048 } 1049 PetscCall(PetscLogGpuTimeEnd()); 1050 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 1051 PetscFunctionReturn(0); 1052 } 1053 1054 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) { 1055 Mat_SeqAIJKokkos *akok; 1056 Mat_SeqAIJ *aseq; 1057 1058 PetscFunctionBegin; 1059 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1060 aseq = static_cast<Mat_SeqAIJ *>(mat->data); 1061 akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1062 delete akok; 1063 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 1064 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1065 akok->SetUpCOO(aseq); 1066 PetscFunctionReturn(0); 1067 } 1068 1069 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) { 1070 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 1071 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1072 PetscCount Annz = aseq->nz; 1073 const PetscCountKokkosView &jmap = akok->jmap_d; 1074 const PetscCountKokkosView &perm = akok->perm_d; 1075 MatScalarKokkosView Aa; 1076 ConstMatScalarKokkosView kv; 1077 PetscMemType memtype; 1078 1079 PetscFunctionBegin; 1080 PetscCall(PetscGetMemType(v, &memtype)); 1081 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1082 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 1083 } else { 1084 kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 1085 } 1086 1087 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1088 else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 1089 1090 Kokkos::parallel_for( 1091 Annz, KOKKOS_LAMBDA(const PetscCount i) { 1092 PetscScalar sum = 0.0; 1093 for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1094 Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1095 }); 1096 1097 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1098 else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 1099 PetscFunctionReturn(0); 1100 } 1101 1102 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) { 1103 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1104 MatScalarKokkosView Aa; 1105 const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 1106 PetscInt m = A->rmap->n; 1107 ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 1108 1109 PetscFunctionBegin; 1110 PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1111 Kokkos::parallel_for( 1112 m, KOKKOS_LAMBDA(const PetscInt i) { 1113 PetscScalar tmp; 1114 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 1115 tmp = Aa(Ai(i)); 1116 Aa(Ai(i)) = Aa(Adiag(i)); 1117 Aa(Adiag(i)) = tmp; 1118 } 1119 }); 1120 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 1121 PetscFunctionReturn(0); 1122 } 1123 1124 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) { 1125 PetscFunctionBegin; 1126 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1127 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1128 B->offloadmask = PETSC_OFFLOAD_CPU; 1129 PetscFunctionReturn(0); 1130 } 1131 1132 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) { 1133 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1134 1135 PetscFunctionBegin; 1136 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1137 A->boundtocpu = PETSC_FALSE; 1138 1139 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1140 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1141 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1142 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1143 A->ops->scale = MatScale_SeqAIJKokkos; 1144 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1145 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1146 A->ops->mult = MatMult_SeqAIJKokkos; 1147 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1148 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1149 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1150 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1151 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1152 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1153 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1154 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1155 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1156 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1157 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1158 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1159 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1160 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1161 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1162 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1163 1164 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 1165 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1166 PetscFunctionReturn(0); 1167 } 1168 1169 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) { 1170 Mat_SeqAIJ *aseq; 1171 PetscInt i, m, n; 1172 1173 PetscFunctionBegin; 1174 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1175 1176 m = akok->nrows(); 1177 n = akok->ncols(); 1178 PetscCall(MatSetSizes(A, m, n, m, n)); 1179 PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1180 1181 /* Set up data structures of A as a MATSEQAIJ */ 1182 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1183 aseq = (Mat_SeqAIJ *)(A)->data; 1184 1185 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1186 akok->j_dual.sync_host(); 1187 1188 aseq->i = akok->i_host_data(); 1189 aseq->j = akok->j_host_data(); 1190 aseq->a = akok->a_host_data(); 1191 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1192 aseq->singlemalloc = PETSC_FALSE; 1193 aseq->free_a = PETSC_FALSE; 1194 aseq->free_ij = PETSC_FALSE; 1195 aseq->nz = akok->nnz(); 1196 aseq->maxnz = aseq->nz; 1197 1198 PetscCall(PetscMalloc1(m, &aseq->imax)); 1199 PetscCall(PetscMalloc1(m, &aseq->ilen)); 1200 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1201 1202 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1203 akok->nonzerostate = A->nonzerostate; 1204 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1205 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1206 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1207 PetscFunctionReturn(0); 1208 } 1209 1210 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1211 1212 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1213 */ 1214 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) { 1215 PetscFunctionBegin; 1216 PetscCall(MatCreate(comm, A)); 1217 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1218 PetscFunctionReturn(0); 1219 } 1220 1221 /* --------------------------------------------------------------------------------*/ 1222 /*@C 1223 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1224 (the default parallel PETSc format). This matrix will ultimately be handled by 1225 Kokkos for calculations. For good matrix 1226 assembly performance the user should preallocate the matrix storage by setting 1227 the parameter nz (or the array nnz). By setting these parameters accurately, 1228 performance during matrix assembly can be increased by more than a factor of 50. 1229 1230 Collective 1231 1232 Input Parameters: 1233 + comm - MPI communicator, set to PETSC_COMM_SELF 1234 . m - number of rows 1235 . n - number of columns 1236 . nz - number of nonzeros per row (same for all rows) 1237 - nnz - array containing the number of nonzeros in the various rows 1238 (possibly different for each row) or NULL 1239 1240 Output Parameter: 1241 . A - the matrix 1242 1243 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1244 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1245 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1246 1247 Notes: 1248 If nnz is given then nz is ignored 1249 1250 The AIJ format (also called the Yale sparse matrix format or 1251 compressed row storage), is fully compatible with standard Fortran 77 1252 storage. That is, the stored row and column indices can begin at 1253 either one (as in Fortran) or zero. See the users' manual for details. 1254 1255 Specify the preallocated storage with either nz or nnz (not both). 1256 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1257 allocation. For large problems you MUST preallocate memory or you 1258 will get TERRIBLE performance, see the users' manual chapter on matrices. 1259 1260 By default, this format uses inodes (identical nodes) when possible, to 1261 improve numerical efficiency of matrix-vector products and solves. We 1262 search for consecutive rows with the same nonzero structure, thereby 1263 reusing matrix information to achieve increased efficiency. 1264 1265 Level: intermediate 1266 1267 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1268 @*/ 1269 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 1270 PetscFunctionBegin; 1271 PetscCall(PetscKokkosInitializeCheck()); 1272 PetscCall(MatCreate(comm, A)); 1273 PetscCall(MatSetSizes(*A, m, n, m, n)); 1274 PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 1275 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 1276 PetscFunctionReturn(0); 1277 } 1278 1279 typedef Kokkos::TeamPolicy<>::member_type team_member; 1280 // 1281 // This factorization exploits block diagonal matrices with "Nf" (not used). 1282 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1283 // 1284 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) { 1285 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1286 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1287 IS isrow = b->row, isicol = b->icol; 1288 const PetscInt *r_h, *ic_h; 1289 const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1290 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1291 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1292 PetscBool row_identity, col_identity; 1293 PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1294 1295 PetscFunctionBegin; 1296 PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1297 PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 1298 PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 1299 PetscCall(ISGetIndices(isrow, &r_h)); 1300 PetscCall(ISGetIndices(isicol, &ic_h)); 1301 PetscCall(ISGetSize(isicol, &nc)); 1302 PetscCall(PetscLogGpuTimeBegin()); 1303 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1304 { 1305 #define KOKKOS_SHARED_LEVEL 1 1306 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1307 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1308 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1309 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 1310 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 1311 const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 1312 Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 1313 size_t flops_h = 0.0; 1314 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 1315 Kokkos::View<size_t> d_flops_k("flops"); 1316 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1317 const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1318 Kokkos::deep_copy(d_flops_k, h_flops_k); 1319 Kokkos::deep_copy(d_r_k, h_r_k); 1320 Kokkos::deep_copy(d_ic_k, h_ic_k); 1321 // Fill A --> fact 1322 Kokkos::parallel_for( 1323 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1324 const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 1325 const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 1326 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1327 // zero rows of B 1328 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1329 PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 1330 PetscScalar *baL = ba_d + bi_d[rowb]; 1331 PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 1332 /* zero (unfactored row) */ 1333 for (int j = 0; j < nzbL; j++) baL[j] = 0; 1334 for (int j = 0; j < nzbU; j++) baU[j] = 0; 1335 }); 1336 // copy A into B 1337 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 1338 PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 1339 const PetscScalar *av = aa_d + ai_d[rowa]; 1340 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1341 /* load in initial (unfactored row) */ 1342 for (int j = 0; j < nza; j++) { 1343 PetscInt colb = ic[ajtmp[j]]; 1344 PetscScalar vala = av[j]; 1345 if (colb == rowb) { 1346 *(ba_d + bdiag_d[rowb]) = vala; 1347 } else { 1348 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1349 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 1350 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 1351 for (int j = 0; j < nz; j++) { 1352 if (pbj[j] == colb) { 1353 pba[j] = vala; 1354 set++; 1355 break; 1356 } 1357 } 1358 #if !defined(PETSC_HAVE_SYCL) 1359 if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1360 #endif 1361 } 1362 } 1363 }); 1364 }); 1365 Kokkos::fence(); 1366 1367 Kokkos::parallel_for( 1368 Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 1369 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1370 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1371 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1372 const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 1373 const PetscInt start = field * nloc, end = start + nloc; 1374 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1375 // A22 panel update for each row A(1,:) and col A(:,1) 1376 for (int ii = start; ii < end - 1; ii++) { 1377 const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 1378 const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 1379 const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 1380 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1381 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 1382 PetscInt kIdx = j * Ni + field_block_idx; 1383 if (kIdx >= nzUi) /* void */ 1384 ; 1385 else { 1386 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1387 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1388 const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 1389 size_t st_idx; 1390 // find and do L(k,i) = A(:k,i) / A(i,i) 1391 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1392 // get column, there has got to be a better way 1393 Kokkos::parallel_reduce( 1394 Kokkos::ThreadVectorRange(team, nzL), 1395 [&](const int &j, size_t &idx) { 1396 if (pjL[j] == ii) { 1397 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1398 idx = j; // output 1399 *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1400 } 1401 }, 1402 st_idx); 1403 Kokkos::single(Kokkos::PerThread(team), [=]() { 1404 colkIdx() = st_idx; 1405 L_ki() = *(ba_d + bi_d[myk] + st_idx); 1406 }); 1407 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1408 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 1409 #endif 1410 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1411 // U(i+1,:end) 1412 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 1413 PetscScalar Uij = baUi[uiIdx]; 1414 PetscInt col = bjUi[uiIdx]; 1415 if (col == myk) { 1416 // A_kk = A_kk - L_ki * U_ij(k) 1417 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1418 *Akkv = *Akkv - L_ki() * Uij; // UiK 1419 } else { 1420 PetscScalar *start, *end, *pAkjv = NULL; 1421 PetscInt high, low; 1422 const PetscInt *startj; 1423 if (col < myk) { // L 1424 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1425 PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 1426 start = pLki + 1; // start at pLki+1, A22(myk,1) 1427 startj = bj_d + bi_d[myk] + idx; 1428 end = ba_d + bi_d[myk + 1]; 1429 } else { 1430 PetscInt idx = bdiag_d[myk + 1] + 1; 1431 start = ba_d + idx; 1432 startj = bj_d + idx; 1433 end = ba_d + bdiag_d[myk]; 1434 } 1435 // search for 'col', use bisection search - TODO 1436 low = 0; 1437 high = (PetscInt)(end - start); 1438 while (high - low > 5) { 1439 int t = (low + high) / 2; 1440 if (startj[t] > col) high = t; 1441 else low = t; 1442 } 1443 for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 1444 if (startj[pAkjv - start] == col) break; 1445 } 1446 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1447 if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 1448 #endif 1449 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1450 } 1451 }); 1452 } 1453 }); 1454 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1455 if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 1456 } /* endof for (i=0; i<n; i++) { */ 1457 Kokkos::single(Kokkos::PerTeam(team), [=]() { 1458 Kokkos::atomic_add(&d_flops_k(), flops()); 1459 flops() = 0; 1460 }); 1461 }); 1462 Kokkos::fence(); 1463 Kokkos::deep_copy(h_flops_k, d_flops_k); 1464 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1465 Kokkos::parallel_for( 1466 Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 1467 const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 1468 const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 1469 /* Invert diagonal for simpler triangular solves */ 1470 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 1471 int i = start + outer_index * Ni + lg_rank % Ni; 1472 if (i < end) { 1473 PetscScalar *pv = ba_d + bdiag_d[i]; 1474 *pv = 1.0 / (*pv); 1475 } 1476 }); 1477 }); 1478 } 1479 PetscCall(PetscLogGpuTimeEnd()); 1480 PetscCall(ISRestoreIndices(isicol, &ic_h)); 1481 PetscCall(ISRestoreIndices(isrow, &r_h)); 1482 1483 PetscCall(ISIdentity(isrow, &row_identity)); 1484 PetscCall(ISIdentity(isicol, &col_identity)); 1485 if (b->inode.size) { 1486 B->ops->solve = MatSolve_SeqAIJ_Inode; 1487 } else if (row_identity && col_identity) { 1488 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1489 } else { 1490 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1491 } 1492 B->offloadmask = PETSC_OFFLOAD_GPU; 1493 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1494 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1495 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1496 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1497 B->ops->matsolve = MatMatSolve_SeqAIJ; 1498 B->assembled = PETSC_TRUE; 1499 B->preallocated = PETSC_TRUE; 1500 1501 PetscFunctionReturn(0); 1502 } 1503 1504 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1505 PetscFunctionBegin; 1506 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1507 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1508 PetscFunctionReturn(0); 1509 } 1510 1511 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) { 1512 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1513 1514 PetscFunctionBegin; 1515 if (!factors->sptrsv_symbolic_completed) { 1516 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 1517 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 1518 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1519 } 1520 PetscFunctionReturn(0); 1521 } 1522 1523 /* Check if we need to update factors etc for transpose solve */ 1524 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) { 1525 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1526 MatColIdxType n = A->rmap->n; 1527 1528 PetscFunctionBegin; 1529 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1530 /* Update L^T and do sptrsv symbolic */ 1531 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 1532 Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1533 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1534 factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 1535 1536 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 1537 factors->iLt_d, factors->jLt_d, factors->aLt_d); 1538 1539 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1540 We have to sort the indices, until KK provides finer control options. 1541 */ 1542 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 1543 1544 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 1545 1546 /* Update U^T and do sptrsv symbolic */ 1547 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 1548 Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1549 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1550 factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 1551 1552 transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 1553 factors->iUt_d, factors->jUt_d, factors->aUt_d); 1554 1555 /* Sort indices. See comments above */ 1556 sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 1557 1558 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 1559 factors->transpose_updated = PETSC_TRUE; 1560 } 1561 PetscFunctionReturn(0); 1562 } 1563 1564 /* Solve Ax = b, with A = LU */ 1565 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) { 1566 ConstPetscScalarKokkosView bv; 1567 PetscScalarKokkosView xv; 1568 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1569 1570 PetscFunctionBegin; 1571 PetscCall(PetscLogGpuTimeBegin()); 1572 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1573 PetscCall(VecGetKokkosView(b, &bv)); 1574 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1575 /* Solve L tmpv = b */ 1576 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 1577 /* Solve Ux = tmpv */ 1578 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 1579 PetscCall(VecRestoreKokkosView(b, &bv)); 1580 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1581 PetscCall(PetscLogGpuTimeEnd()); 1582 PetscFunctionReturn(0); 1583 } 1584 1585 /* Solve A^T x = b, where A^T = U^T L^T */ 1586 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) { 1587 ConstPetscScalarKokkosView bv; 1588 PetscScalarKokkosView xv; 1589 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1590 1591 PetscFunctionBegin; 1592 PetscCall(PetscLogGpuTimeBegin()); 1593 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1594 PetscCall(VecGetKokkosView(b, &bv)); 1595 PetscCall(VecGetKokkosViewWrite(x, &xv)); 1596 /* Solve U^T tmpv = b */ 1597 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 1598 1599 /* Solve L^T x = tmpv */ 1600 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 1601 PetscCall(VecRestoreKokkosView(b, &bv)); 1602 PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 1603 PetscCall(PetscLogGpuTimeEnd()); 1604 PetscFunctionReturn(0); 1605 } 1606 1607 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) { 1608 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1609 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1610 PetscInt fill_lev = info->levels; 1611 1612 PetscFunctionBegin; 1613 PetscCall(PetscLogGpuTimeBegin()); 1614 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1615 1616 auto a_d = aijkok->a_dual.view_device(); 1617 auto i_d = aijkok->i_dual.view_device(); 1618 auto j_d = aijkok->j_dual.view_device(); 1619 1620 KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 1621 1622 B->assembled = PETSC_TRUE; 1623 B->preallocated = PETSC_TRUE; 1624 B->ops->solve = MatSolve_SeqAIJKokkos; 1625 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1626 B->ops->matsolve = NULL; 1627 B->ops->matsolvetranspose = NULL; 1628 B->offloadmask = PETSC_OFFLOAD_GPU; 1629 1630 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1631 factors->transpose_updated = PETSC_FALSE; 1632 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1633 /* TODO: log flops, but how to know that? */ 1634 PetscCall(PetscLogGpuTimeEnd()); 1635 PetscFunctionReturn(0); 1636 } 1637 1638 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1639 Mat_SeqAIJKokkos *aijkok; 1640 Mat_SeqAIJ *b; 1641 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1642 PetscInt fill_lev = info->levels; 1643 PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 1644 PetscInt n = A->rmap->n; 1645 1646 PetscFunctionBegin; 1647 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1648 /* Rebuild factors */ 1649 if (factors) { 1650 factors->Destroy(); 1651 } /* Destroy the old if it exists */ 1652 else { 1653 B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1654 } 1655 1656 /* Create a spiluk handle and then do symbolic factorization */ 1657 nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 1658 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 1659 1660 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1661 1662 Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 1663 Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 1664 Kokkos::realloc(factors->iU_d, n + 1); 1665 Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 1666 1667 aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1668 auto i_d = aijkok->i_dual.view_device(); 1669 auto j_d = aijkok->j_dual.view_device(); 1670 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 1671 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1672 1673 Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1674 Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 1675 Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 1676 Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 1677 1678 /* TODO: add options to select sptrsv algorithms */ 1679 /* Create sptrsv handles for L, U and their transpose */ 1680 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1681 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1682 #else 1683 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1684 #endif 1685 1686 factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 1687 factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 1688 factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 1689 factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 1690 1691 /* Fill fields of the factor matrix B */ 1692 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 1693 b = (Mat_SeqAIJ *)B->data; 1694 b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 1695 B->info.fill_ratio_given = info->fill; 1696 B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 1697 1698 B->offloadmask = PETSC_OFFLOAD_GPU; 1699 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1700 PetscFunctionReturn(0); 1701 } 1702 1703 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1704 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1705 const PetscInt nrows = A->rmap->n; 1706 1707 PetscFunctionBegin; 1708 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1709 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1710 // move B data into Kokkos 1711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1712 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1713 { 1714 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1715 if (!baijkok->diag_d.extent(0)) { 1716 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1717 baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1718 Kokkos::deep_copy(baijkok->diag_d, h_diag); 1719 } 1720 } 1721 PetscFunctionReturn(0); 1722 } 1723 1724 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) { 1725 PetscFunctionBegin; 1726 *type = MATSOLVERKOKKOS; 1727 PetscFunctionReturn(0); 1728 } 1729 1730 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) { 1731 PetscFunctionBegin; 1732 *type = MATSOLVERKOKKOSDEVICE; 1733 PetscFunctionReturn(0); 1734 } 1735 1736 /*MC 1737 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1738 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1739 1740 Level: beginner 1741 1742 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1743 M*/ 1744 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1745 { 1746 PetscInt n = A->rmap->n; 1747 1748 PetscFunctionBegin; 1749 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1750 PetscCall(MatSetSizes(*B, n, n, n, n)); 1751 (*B)->factortype = ftype; 1752 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1753 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1754 1755 if (ftype == MAT_FACTOR_LU) { 1756 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1757 (*B)->canuseordering = PETSC_TRUE; 1758 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1759 } else if (ftype == MAT_FACTOR_ILU) { 1760 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1761 (*B)->canuseordering = PETSC_FALSE; 1762 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1763 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1764 1765 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1766 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1767 PetscFunctionReturn(0); 1768 } 1769 1770 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) { 1771 PetscInt n = A->rmap->n; 1772 1773 PetscFunctionBegin; 1774 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 1775 PetscCall(MatSetSizes(*B, n, n, n, n)); 1776 (*B)->factortype = ftype; 1777 (*B)->canuseordering = PETSC_TRUE; 1778 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 1779 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1780 1781 if (ftype == MAT_FACTOR_LU) { 1782 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 1783 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1784 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 1785 1786 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1787 PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 1788 PetscFunctionReturn(0); 1789 } 1790 1791 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) { 1792 PetscFunctionBegin; 1793 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1794 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 1795 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 1796 PetscFunctionReturn(0); 1797 } 1798 1799 /* Utility to print out a KokkosCsrMatrix for debugging */ 1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) { 1801 const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1802 const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1803 const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1804 const PetscInt *i = iv.data(); 1805 const PetscInt *j = jv.data(); 1806 const PetscScalar *a = av.data(); 1807 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1808 1809 PetscFunctionBegin; 1810 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1811 for (PetscInt k = 0; k < m; k++) { 1812 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1813 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 1814 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1815 } 1816 PetscFunctionReturn(0); 1817 } 1818