1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok && aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 if (aijkok) { 105 aijkok->a_dual.sync_host(); 106 *array = aijkok->a_dual.view_host().data(); 107 } else { /* Happens when calling MatSetValues on a newly created matrix */ 108 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109 } 110 PetscFunctionReturn(0); 111 } 112 113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116 117 PetscFunctionBegin; 118 if (aijkok) aijkok->a_dual.modify_host(); 119 PetscFunctionReturn(0); 120 } 121 122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123 { 124 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125 126 PetscFunctionBegin; 127 if (aijkok) { 128 aijkok->a_dual.sync_host(); 129 *array = aijkok->a_dual.view_host().data(); 130 } else { 131 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 132 } 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137 { 138 PetscFunctionBegin; 139 *array = NULL; 140 PetscFunctionReturn(0); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok) { 149 *array = aijkok->a_dual.view_host().data(); 150 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 151 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 152 } 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok) { 162 aijkok->a_dual.clear_sync_state(); 163 aijkok->a_dual.modify_host(); 164 } 165 PetscFunctionReturn(0); 166 } 167 168 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 169 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 170 { 171 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 172 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 173 174 PetscFunctionBegin; 175 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 176 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 177 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 178 PetscFunctionReturn(0); 179 } 180 181 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 182 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 185 186 PetscFunctionBegin; 187 if (aijkok && aijkok->device_mat_d.data()) { 188 *d_mat = aijkok->device_mat_d.data(); 189 } else { 190 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 191 *d_mat = NULL; 192 } 193 PetscFunctionReturn(0); 194 } 195 196 /* Generate the transpose on device and cache it internally */ 197 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 198 { 199 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 200 201 PetscFunctionBegin; 202 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 203 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 204 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 205 PetscCallCXX(aijkok->a_dual.sync_device()); 206 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 207 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 208 aijkok->transpose_updated = PETSC_TRUE; 209 } 210 *csrmatT = &aijkok->csrmatT; 211 PetscFunctionReturn(0); 212 } 213 214 /* Generate the Hermitian on device and cache it internally */ 215 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 216 { 217 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 218 219 PetscFunctionBegin; 220 PetscCall(PetscLogGpuTimeBegin()); 221 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 222 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 223 PetscCallCXX(aijkok->a_dual.sync_device()); 224 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 225 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 226 #if defined(PETSC_USE_COMPLEX) 227 const auto& a = aijkok->csrmatH.values; 228 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 229 #endif 230 aijkok->hermitian_updated = PETSC_TRUE; 231 } 232 *csrmatH = &aijkok->csrmatH; 233 PetscCall(PetscLogGpuTimeEnd()); 234 PetscFunctionReturn(0); 235 } 236 237 /* y = A x */ 238 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 239 { 240 Mat_SeqAIJKokkos *aijkok; 241 ConstPetscScalarKokkosView xv; 242 PetscScalarKokkosView yv; 243 244 PetscFunctionBegin; 245 PetscCall(PetscLogGpuTimeBegin()); 246 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 247 PetscCall(VecGetKokkosView(xx,&xv)); 248 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 249 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 250 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 251 PetscCall(VecRestoreKokkosView(xx,&xv)); 252 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 253 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 254 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A^T x */ 260 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 const char *mode; 264 ConstPetscScalarKokkosView xv; 265 PetscScalarKokkosView yv; 266 KokkosCsrMatrix *csrmat; 267 268 PetscFunctionBegin; 269 PetscCall(PetscLogGpuTimeBegin()); 270 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 271 PetscCall(VecGetKokkosView(xx,&xv)); 272 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 273 if (A->form_explicit_transpose) { 274 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 275 mode = "N"; 276 } else { 277 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 278 csrmat = &aijkok->csrmat; 279 mode = "T"; 280 } 281 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 282 PetscCall(VecRestoreKokkosView(xx,&xv)); 283 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 284 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 285 PetscCall(PetscLogGpuTimeEnd()); 286 PetscFunctionReturn(0); 287 } 288 289 /* y = A^H x */ 290 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 291 { 292 Mat_SeqAIJKokkos *aijkok; 293 const char *mode; 294 ConstPetscScalarKokkosView xv; 295 PetscScalarKokkosView yv; 296 KokkosCsrMatrix *csrmat; 297 298 PetscFunctionBegin; 299 PetscCall(PetscLogGpuTimeBegin()); 300 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 301 PetscCall(VecGetKokkosView(xx,&xv)); 302 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 303 if (A->form_explicit_transpose) { 304 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 305 mode = "N"; 306 } else { 307 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 308 csrmat = &aijkok->csrmat; 309 mode = "C"; 310 } 311 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 312 PetscCall(VecRestoreKokkosView(xx,&xv)); 313 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 314 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 315 PetscCall(PetscLogGpuTimeEnd()); 316 PetscFunctionReturn(0); 317 } 318 319 /* z = A x + y */ 320 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 321 { 322 Mat_SeqAIJKokkos *aijkok; 323 ConstPetscScalarKokkosView xv,yv; 324 PetscScalarKokkosView zv; 325 326 PetscFunctionBegin; 327 PetscCall(PetscLogGpuTimeBegin()); 328 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 329 PetscCall(VecGetKokkosView(xx,&xv)); 330 PetscCall(VecGetKokkosView(yy,&yv)); 331 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 332 if (zz != yy) Kokkos::deep_copy(zv,yv); 333 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 334 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 335 PetscCall(VecRestoreKokkosView(xx,&xv)); 336 PetscCall(VecRestoreKokkosView(yy,&yv)); 337 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 338 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 339 PetscCall(PetscLogGpuTimeEnd()); 340 PetscFunctionReturn(0); 341 } 342 343 /* z = A^T x + y */ 344 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 345 { 346 Mat_SeqAIJKokkos *aijkok; 347 const char *mode; 348 ConstPetscScalarKokkosView xv,yv; 349 PetscScalarKokkosView zv; 350 KokkosCsrMatrix *csrmat; 351 352 PetscFunctionBegin; 353 PetscCall(PetscLogGpuTimeBegin()); 354 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 355 PetscCall(VecGetKokkosView(xx,&xv)); 356 PetscCall(VecGetKokkosView(yy,&yv)); 357 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 358 if (zz != yy) Kokkos::deep_copy(zv,yv); 359 if (A->form_explicit_transpose) { 360 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 361 mode = "N"; 362 } else { 363 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 364 csrmat = &aijkok->csrmat; 365 mode = "T"; 366 } 367 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 368 PetscCall(VecRestoreKokkosView(xx,&xv)); 369 PetscCall(VecRestoreKokkosView(yy,&yv)); 370 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 371 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 372 PetscCall(PetscLogGpuTimeEnd()); 373 PetscFunctionReturn(0); 374 } 375 376 /* z = A^H x + y */ 377 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 378 { 379 Mat_SeqAIJKokkos *aijkok; 380 const char *mode; 381 ConstPetscScalarKokkosView xv,yv; 382 PetscScalarKokkosView zv; 383 KokkosCsrMatrix *csrmat; 384 385 PetscFunctionBegin; 386 PetscCall(PetscLogGpuTimeBegin()); 387 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 388 PetscCall(VecGetKokkosView(xx,&xv)); 389 PetscCall(VecGetKokkosView(yy,&yv)); 390 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 391 if (zz != yy) Kokkos::deep_copy(zv,yv); 392 if (A->form_explicit_transpose) { 393 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 394 mode = "N"; 395 } else { 396 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 397 csrmat = &aijkok->csrmat; 398 mode = "C"; 399 } 400 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 401 PetscCall(VecRestoreKokkosView(xx,&xv)); 402 PetscCall(VecRestoreKokkosView(yy,&yv)); 403 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 404 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 405 PetscCall(PetscLogGpuTimeEnd()); 406 PetscFunctionReturn(0); 407 } 408 409 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 410 { 411 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 412 413 PetscFunctionBegin; 414 switch (op) { 415 case MAT_FORM_EXPLICIT_TRANSPOSE: 416 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 417 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 418 A->form_explicit_transpose = flg; 419 break; 420 default: 421 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 422 break; 423 } 424 PetscFunctionReturn(0); 425 } 426 427 /* Depending on reuse, either build a new mat, or use the existing mat */ 428 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 429 { 430 Mat_SeqAIJ *aseq; 431 432 PetscFunctionBegin; 433 PetscCall(PetscKokkosInitializeCheck()); 434 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 435 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 436 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 437 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 438 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 439 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 440 PetscCall(PetscFree(A->defaultvectype)); 441 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 442 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 443 PetscCall(MatSetOps_SeqAIJKokkos(A)); 444 aseq = static_cast<Mat_SeqAIJ*>(A->data); 445 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 446 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 447 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 448 } 449 } 450 PetscFunctionReturn(0); 451 } 452 453 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 454 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 455 */ 456 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 457 { 458 Mat_SeqAIJ *bseq; 459 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 460 Mat mat; 461 462 PetscFunctionBegin; 463 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 464 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 465 mat = *B; 466 if (A->assembled) { 467 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 468 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 469 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 470 /* Now copy values to B if needed */ 471 if (dupOption == MAT_COPY_VALUES) { 472 if (akok->a_dual.need_sync_device()) { 473 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 474 bkok->a_dual.modify_host(); 475 } else { /* If device has the latest data, we only copy data on device */ 476 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 477 bkok->a_dual.modify_device(); 478 } 479 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 480 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 481 bkok->a_dual.modify_host(); 482 } 483 mat->spptr = bkok; 484 } 485 486 PetscCall(PetscFree(mat->defaultvectype)); 487 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 488 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 489 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 490 PetscFunctionReturn(0); 491 } 492 493 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 494 { 495 Mat At; 496 KokkosCsrMatrix *internT; 497 Mat_SeqAIJKokkos *atkok,*bkok; 498 499 PetscFunctionBegin; 500 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 501 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 502 /* Deep copy internT, as we want to isolate the internal transpose */ 503 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 504 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 505 if (reuse == MAT_INITIAL_MATRIX) *B = At; 506 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 507 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 508 if ((*B)->assembled) { 509 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 510 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 511 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 512 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 513 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 514 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 515 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 516 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 517 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 518 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 519 } 520 PetscFunctionReturn(0); 521 } 522 523 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 524 { 525 Mat_SeqAIJKokkos *aijkok; 526 527 PetscFunctionBegin; 528 if (A->factortype == MAT_FACTOR_NONE) { 529 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 530 delete aijkok; 531 } else { 532 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 533 } 534 A->spptr = NULL; 535 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 536 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 537 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 538 PetscCall(MatDestroy_SeqAIJ(A)); 539 PetscFunctionReturn(0); 540 } 541 542 /*MC 543 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 544 545 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 546 547 Options Database Keys: 548 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 549 550 Level: beginner 551 552 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 553 M*/ 554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 555 { 556 PetscFunctionBegin; 557 PetscCall(PetscKokkosInitializeCheck()); 558 PetscCall(MatCreate_SeqAIJ(A)); 559 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 560 PetscFunctionReturn(0); 561 } 562 563 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 564 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 565 { 566 Mat_SeqAIJ *a,*b; 567 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 568 MatScalarKokkosView aa,ba,ca; 569 MatRowMapKokkosView ai,bi,ci; 570 MatColIdxKokkosView aj,bj,cj; 571 PetscInt m,n,nnz,aN; 572 573 PetscFunctionBegin; 574 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 575 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 576 PetscValidPointer(C,4); 577 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 578 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 579 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 580 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 581 582 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 583 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 584 a = static_cast<Mat_SeqAIJ*>(A->data); 585 b = static_cast<Mat_SeqAIJ*>(B->data); 586 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 587 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 588 aa = akok->a_dual.view_device(); 589 ai = akok->i_dual.view_device(); 590 ba = bkok->a_dual.view_device(); 591 bi = bkok->i_dual.view_device(); 592 m = A->rmap->n; /* M, N and nnz of C */ 593 n = A->cmap->n + B->cmap->n; 594 nnz = a->nz + b->nz; 595 aN = A->cmap->n; /* N of A */ 596 if (reuse == MAT_INITIAL_MATRIX) { 597 aj = akok->j_dual.view_device(); 598 bj = bkok->j_dual.view_device(); 599 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 600 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 601 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 602 ca = ca_dual.view_device(); 603 ci = ci_dual.view_device(); 604 cj = cj_dual.view_device(); 605 606 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 607 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 608 PetscInt i = t.league_rank(); /* row i */ 609 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 610 611 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 612 ci(i) = coffset; 613 if (i == m-1) ci(m) = ai(m) + bi(m); 614 }); 615 616 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 617 if (k < alen) { 618 ca(coffset+k) = aa(ai(i)+k); 619 cj(coffset+k) = aj(ai(i)+k); 620 } else { 621 ca(coffset+k) = ba(bi(i)+k-alen); 622 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 623 } 624 }); 625 }); 626 ca_dual.modify_device(); 627 ci_dual.modify_device(); 628 cj_dual.modify_device(); 629 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 630 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 631 } else if (reuse == MAT_REUSE_MATRIX) { 632 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 633 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 634 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 635 ca = ckok->a_dual.view_device(); 636 ci = ckok->i_dual.view_device(); 637 638 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 639 PetscInt i = t.league_rank(); /* row i */ 640 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 641 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 642 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 643 else ca(ci(i)+k) = ba(bi(i)+k-alen); 644 }); 645 }); 646 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 647 } 648 PetscFunctionReturn(0); 649 } 650 651 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 652 { 653 PetscFunctionBegin; 654 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 655 PetscFunctionReturn(0); 656 } 657 658 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 659 { 660 Mat_Product *product = C->product; 661 Mat A,B; 662 bool transA,transB; /* use bool, since KK needs this type */ 663 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 664 Mat_SeqAIJ *c; 665 MatProductData_SeqAIJKokkos *pdata; 666 KokkosCsrMatrix *csrmatA,*csrmatB; 667 668 PetscFunctionBegin; 669 MatCheckProduct(C,1); 670 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 671 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 672 673 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 674 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 675 PetscFunctionReturn(0); 676 } 677 678 switch (product->type) { 679 case MATPRODUCT_AB: transA = false; transB = false; break; 680 case MATPRODUCT_AtB: transA = true; transB = false; break; 681 case MATPRODUCT_ABt: transA = false; transB = true; break; 682 default: 683 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 684 } 685 686 A = product->A; 687 B = product->B; 688 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 689 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 690 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 691 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 692 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 693 694 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 695 696 csrmatA = &akok->csrmat; 697 csrmatB = &bkok->csrmat; 698 699 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 700 if (transA) { 701 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 702 transA = false; 703 } 704 705 if (transB) { 706 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 707 transB = false; 708 } 709 PetscCall(PetscLogGpuTimeBegin()); 710 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 711 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 712 PetscCall(PetscLogGpuTimeEnd()); 713 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 714 /* shorter version of MatAssemblyEnd_SeqAIJ */ 715 c = (Mat_SeqAIJ*)C->data; 716 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 717 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 718 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 719 c->reallocs = 0; 720 C->info.mallocs = 0; 721 C->info.nz_unneeded = 0; 722 C->assembled = C->was_assembled = PETSC_TRUE; 723 C->num_ass++; 724 PetscFunctionReturn(0); 725 } 726 727 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 728 { 729 Mat_Product *product = C->product; 730 MatProductType ptype; 731 Mat A,B; 732 bool transA,transB; 733 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 734 MatProductData_SeqAIJKokkos *pdata; 735 MPI_Comm comm; 736 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 737 738 PetscFunctionBegin; 739 MatCheckProduct(C,1); 740 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 741 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 742 A = product->A; 743 B = product->B; 744 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 745 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 746 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 747 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 748 csrmatA = &akok->csrmat; 749 csrmatB = &bkok->csrmat; 750 751 ptype = product->type; 752 switch (ptype) { 753 case MATPRODUCT_AB: transA = false; transB = false; break; 754 case MATPRODUCT_AtB: transA = true; transB = false; break; 755 case MATPRODUCT_ABt: transA = false; transB = true; break; 756 default: 757 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 758 } 759 760 product->data = pdata = new MatProductData_SeqAIJKokkos(); 761 pdata->kh.set_team_work_size(16); 762 pdata->kh.set_dynamic_scheduling(true); 763 pdata->reusesym = product->api_user; 764 765 /* TODO: add command line options to select spgemm algorithms */ 766 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 767 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 768 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 769 */ 770 pdata->kh.create_spgemm_handle(spgemm_alg); 771 772 PetscCall(PetscLogGpuTimeBegin()); 773 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 774 if (transA) { 775 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 776 transA = false; 777 } 778 779 if (transB) { 780 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 781 transB = false; 782 } 783 784 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 785 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 786 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 787 calling new Mat_SeqAIJKokkos(). 788 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 789 */ 790 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 791 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 792 PetscCall(PetscLogGpuTimeEnd()); 793 794 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 795 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 796 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 797 PetscFunctionReturn(0); 798 } 799 800 /* handles sparse matrix matrix ops */ 801 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 802 { 803 Mat_Product *product = mat->product; 804 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 805 806 PetscFunctionBegin; 807 MatCheckProduct(mat,1); 808 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 809 if (product->type == MATPRODUCT_ABC) { 810 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 811 } 812 if (Biskok && Ciskok) { 813 switch (product->type) { 814 case MATPRODUCT_AB: 815 case MATPRODUCT_AtB: 816 case MATPRODUCT_ABt: 817 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 818 break; 819 case MATPRODUCT_PtAP: 820 case MATPRODUCT_RARt: 821 case MATPRODUCT_ABC: 822 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 823 break; 824 default: 825 break; 826 } 827 } else { /* fallback for AIJ */ 828 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 829 } 830 PetscFunctionReturn(0); 831 } 832 833 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 834 { 835 Mat_SeqAIJKokkos *aijkok; 836 837 PetscFunctionBegin; 838 PetscCall(PetscLogGpuTimeBegin()); 839 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 840 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 841 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 842 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 843 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 844 PetscCall(PetscLogGpuTimeEnd()); 845 PetscFunctionReturn(0); 846 } 847 848 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 849 { 850 Mat_SeqAIJKokkos *aijkok; 851 852 PetscFunctionBegin; 853 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 854 if (aijkok) { /* Only zero the device if data is already there */ 855 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 856 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 857 } else { /* Might be preallocated but not assembled */ 858 PetscCall(MatZeroEntries_SeqAIJ(A)); 859 } 860 PetscFunctionReturn(0); 861 } 862 863 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 864 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 865 { 866 Mat_SeqAIJKokkos *aijkok; 867 868 PetscFunctionBegin; 869 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 870 PetscValidPointer(kv,2); 871 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 872 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 873 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 874 *kv = aijkok->a_dual.view_device(); 875 PetscFunctionReturn(0); 876 } 877 878 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 879 { 880 PetscFunctionBegin; 881 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 882 PetscValidPointer(kv,2); 883 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 884 PetscFunctionReturn(0); 885 } 886 887 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 888 { 889 Mat_SeqAIJKokkos *aijkok; 890 891 PetscFunctionBegin; 892 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 893 PetscValidPointer(kv,2); 894 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 895 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 896 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 897 *kv = aijkok->a_dual.view_device(); 898 PetscFunctionReturn(0); 899 } 900 901 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 902 { 903 PetscFunctionBegin; 904 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 905 PetscValidPointer(kv,2); 906 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 907 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 908 PetscFunctionReturn(0); 909 } 910 911 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 912 { 913 Mat_SeqAIJKokkos *aijkok; 914 915 PetscFunctionBegin; 916 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 917 PetscValidPointer(kv,2); 918 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 919 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 920 *kv = aijkok->a_dual.view_device(); 921 PetscFunctionReturn(0); 922 } 923 924 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 925 { 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 931 PetscFunctionReturn(0); 932 } 933 934 /* Computes Y += alpha X */ 935 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 936 { 937 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 938 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 939 ConstMatScalarKokkosView Xa; 940 MatScalarKokkosView Ya; 941 942 PetscFunctionBegin; 943 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 944 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 945 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 946 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 947 PetscCall(PetscLogGpuTimeBegin()); 948 949 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 950 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 951 PetscBool e; 952 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 953 if (e) { 954 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 955 if (e) pattern = SAME_NONZERO_PATTERN; 956 } 957 } 958 959 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 960 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 961 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 962 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 963 */ 964 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 965 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 966 Xa = xkok->a_dual.view_device(); 967 Ya = ykok->a_dual.view_device(); 968 969 if (pattern == SAME_NONZERO_PATTERN) { 970 KokkosBlas::axpy(alpha,Xa,Ya); 971 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 972 } else if (pattern == SUBSET_NONZERO_PATTERN) { 973 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 974 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 975 976 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 977 PetscInt i = t.league_rank(); /* row i */ 978 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 979 PetscInt p,q = Yi(i); 980 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 981 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 982 if (Xj(p) == Yj(q)) { /* Found it */ 983 Ya(q) += alpha * Xa(p); 984 q++; 985 } else { 986 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 987 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 988 */ 989 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 990 } 991 } 992 }); 993 }); 994 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 995 } else { /* different nonzero patterns */ 996 Mat Z; 997 KokkosCsrMatrix zcsr; 998 KernelHandle kh; 999 kh.create_spadd_handle(false); 1000 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1001 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1002 zkok = new Mat_SeqAIJKokkos(zcsr); 1003 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1004 PetscCall(MatHeaderReplace(Y,&Z)); 1005 kh.destroy_spadd_handle(); 1006 } 1007 PetscCall(PetscLogGpuTimeEnd()); 1008 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1009 PetscFunctionReturn(0); 1010 } 1011 1012 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1013 { 1014 Mat_SeqAIJKokkos *akok; 1015 Mat_SeqAIJ *aseq; 1016 1017 PetscFunctionBegin; 1018 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1019 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1020 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1021 delete akok; 1022 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1023 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1024 akok->SetUpCOO(aseq); 1025 PetscFunctionReturn(0); 1026 } 1027 1028 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1029 { 1030 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1031 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1032 PetscCount Annz = aseq->nz; 1033 const PetscCountKokkosView& jmap = akok->jmap_d; 1034 const PetscCountKokkosView& perm = akok->perm_d; 1035 MatScalarKokkosView Aa; 1036 ConstMatScalarKokkosView kv; 1037 PetscMemType memtype; 1038 1039 PetscFunctionBegin; 1040 PetscCall(PetscGetMemType(v,&memtype)); 1041 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1042 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1043 } else { 1044 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1045 } 1046 1047 if (imode == INSERT_VALUES) { 1048 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1049 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1050 } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1051 1052 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1053 1054 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1055 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1056 PetscFunctionReturn(0); 1057 } 1058 1059 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1060 { 1061 PetscFunctionBegin; 1062 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1063 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1064 B->offloadmask = PETSC_OFFLOAD_CPU; 1065 PetscFunctionReturn(0); 1066 } 1067 1068 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1069 { 1070 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1071 1072 PetscFunctionBegin; 1073 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1074 A->boundtocpu = PETSC_FALSE; 1075 1076 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1077 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1078 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1079 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1080 A->ops->scale = MatScale_SeqAIJKokkos; 1081 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1082 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1083 A->ops->mult = MatMult_SeqAIJKokkos; 1084 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1085 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1086 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1087 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1088 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1089 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1090 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1091 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1092 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1093 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1094 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1095 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1096 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1097 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1098 1099 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1100 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1101 PetscFunctionReturn(0); 1102 } 1103 1104 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1105 { 1106 Mat_SeqAIJ *aseq; 1107 PetscInt i,m,n; 1108 1109 PetscFunctionBegin; 1110 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1111 1112 m = akok->nrows(); 1113 n = akok->ncols(); 1114 PetscCall(MatSetSizes(A,m,n,m,n)); 1115 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1116 1117 /* Set up data structures of A as a MATSEQAIJ */ 1118 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1119 aseq = (Mat_SeqAIJ*)(A)->data; 1120 1121 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1122 akok->j_dual.sync_host(); 1123 1124 aseq->i = akok->i_host_data(); 1125 aseq->j = akok->j_host_data(); 1126 aseq->a = akok->a_host_data(); 1127 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1128 aseq->singlemalloc = PETSC_FALSE; 1129 aseq->free_a = PETSC_FALSE; 1130 aseq->free_ij = PETSC_FALSE; 1131 aseq->nz = akok->nnz(); 1132 aseq->maxnz = aseq->nz; 1133 1134 PetscCall(PetscMalloc1(m,&aseq->imax)); 1135 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1136 for (i=0; i<m; i++) { 1137 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1138 } 1139 1140 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1141 akok->nonzerostate = A->nonzerostate; 1142 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1143 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1144 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1149 1150 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1151 */ 1152 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1153 { 1154 PetscFunctionBegin; 1155 PetscCall(MatCreate(comm,A)); 1156 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1157 PetscFunctionReturn(0); 1158 } 1159 1160 /* --------------------------------------------------------------------------------*/ 1161 /*@C 1162 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1163 (the default parallel PETSc format). This matrix will ultimately be handled by 1164 Kokkos for calculations. For good matrix 1165 assembly performance the user should preallocate the matrix storage by setting 1166 the parameter nz (or the array nnz). By setting these parameters accurately, 1167 performance during matrix assembly can be increased by more than a factor of 50. 1168 1169 Collective 1170 1171 Input Parameters: 1172 + comm - MPI communicator, set to PETSC_COMM_SELF 1173 . m - number of rows 1174 . n - number of columns 1175 . nz - number of nonzeros per row (same for all rows) 1176 - nnz - array containing the number of nonzeros in the various rows 1177 (possibly different for each row) or NULL 1178 1179 Output Parameter: 1180 . A - the matrix 1181 1182 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1183 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1184 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1185 1186 Notes: 1187 If nnz is given then nz is ignored 1188 1189 The AIJ format (also called the Yale sparse matrix format or 1190 compressed row storage), is fully compatible with standard Fortran 77 1191 storage. That is, the stored row and column indices can begin at 1192 either one (as in Fortran) or zero. See the users' manual for details. 1193 1194 Specify the preallocated storage with either nz or nnz (not both). 1195 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1196 allocation. For large problems you MUST preallocate memory or you 1197 will get TERRIBLE performance, see the users' manual chapter on matrices. 1198 1199 By default, this format uses inodes (identical nodes) when possible, to 1200 improve numerical efficiency of matrix-vector products and solves. We 1201 search for consecutive rows with the same nonzero structure, thereby 1202 reusing matrix information to achieve increased efficiency. 1203 1204 Level: intermediate 1205 1206 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1207 @*/ 1208 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1209 { 1210 PetscFunctionBegin; 1211 PetscCall(PetscKokkosInitializeCheck()); 1212 PetscCall(MatCreate(comm,A)); 1213 PetscCall(MatSetSizes(*A,m,n,m,n)); 1214 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1215 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1216 PetscFunctionReturn(0); 1217 } 1218 1219 typedef Kokkos::TeamPolicy<>::member_type team_member; 1220 // 1221 // This factorization exploits block diagonal matrices with "Nf" (not used). 1222 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1223 // 1224 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1225 { 1226 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1227 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1228 IS isrow = b->row,isicol = b->icol; 1229 const PetscInt *r_h,*ic_h; 1230 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1231 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1232 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1233 PetscBool row_identity,col_identity; 1234 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1235 1236 PetscFunctionBegin; 1237 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1238 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1239 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1240 PetscCall(ISGetIndices(isrow,&r_h)); 1241 PetscCall(ISGetIndices(isicol,&ic_h)); 1242 PetscCall(ISGetSize(isicol,&nc)); 1243 PetscCall(PetscLogGpuTimeBegin()); 1244 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1245 { 1246 #define KOKKOS_SHARED_LEVEL 1 1247 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1248 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1249 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1250 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1251 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1252 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1253 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1254 size_t flops_h = 0.0; 1255 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1256 Kokkos::View<size_t> d_flops_k ("flops"); 1257 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1258 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1259 Kokkos::deep_copy (d_flops_k, h_flops_k); 1260 Kokkos::deep_copy (d_r_k, h_r_k); 1261 Kokkos::deep_copy (d_ic_k, h_ic_k); 1262 // Fill A --> fact 1263 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1264 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1265 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1266 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1267 // zero rows of B 1268 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1269 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1270 PetscScalar *baL = ba_d + bi_d[rowb]; 1271 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1272 /* zero (unfactored row) */ 1273 for (int j=0;j<nzbL;j++) baL[j] = 0; 1274 for (int j=0;j<nzbU;j++) baU[j] = 0; 1275 }); 1276 // copy A into B 1277 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1278 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1279 const PetscScalar *av = aa_d + ai_d[rowa]; 1280 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1281 /* load in initial (unfactored row) */ 1282 for (int j=0;j<nza;j++) { 1283 PetscInt colb = ic[ajtmp[j]]; 1284 PetscScalar vala = av[j]; 1285 if (colb == rowb) { 1286 *(ba_d + bdiag_d[rowb]) = vala; 1287 } else { 1288 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1289 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1290 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1291 for (int j=0; j<nz ; j++) { 1292 if (pbj[j] == colb) { 1293 pba[j] = vala; 1294 set++; 1295 break; 1296 } 1297 } 1298 #if !defined(PETSC_HAVE_SYCL) 1299 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1300 #endif 1301 } 1302 } 1303 }); 1304 }); 1305 Kokkos::fence(); 1306 1307 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1308 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1309 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1310 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1311 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1312 const PetscInt start = field*nloc, end = start + nloc; 1313 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1314 // A22 panel update for each row A(1,:) and col A(:,1) 1315 for (int ii=start; ii<end-1; ii++) { 1316 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1317 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1318 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1319 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1320 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1321 PetscInt kIdx = j*Ni + field_block_idx; 1322 if (kIdx >= nzUi) /* void */ ; 1323 else { 1324 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1325 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1326 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1327 size_t st_idx; 1328 // find and do L(k,i) = A(:k,i) / A(i,i) 1329 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1330 // get column, there has got to be a better way 1331 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1332 if (pjL[j] == ii) { 1333 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1334 idx = j; // output 1335 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1336 } 1337 }, st_idx); 1338 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1339 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1340 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1341 #endif 1342 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1343 // U(i+1,:end) 1344 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1345 PetscScalar Uij = baUi[uiIdx]; 1346 PetscInt col = bjUi[uiIdx]; 1347 if (col==myk) { 1348 // A_kk = A_kk - L_ki * U_ij(k) 1349 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1350 *Akkv = *Akkv - L_ki() * Uij; // UiK 1351 } else { 1352 PetscScalar *start, *end, *pAkjv=NULL; 1353 PetscInt high, low; 1354 const PetscInt *startj; 1355 if (col<myk) { // L 1356 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1357 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1358 start = pLki+1; // start at pLki+1, A22(myk,1) 1359 startj= bj_d + bi_d[myk] + idx; 1360 end = ba_d + bi_d[myk+1]; 1361 } else { 1362 PetscInt idx = bdiag_d[myk+1]+1; 1363 start = ba_d + idx; 1364 startj= bj_d + idx; 1365 end = ba_d + bdiag_d[myk]; 1366 } 1367 // search for 'col', use bisection search - TODO 1368 low = 0; 1369 high = (PetscInt)(end-start); 1370 while (high-low > 5) { 1371 int t = (low+high)/2; 1372 if (startj[t] > col) high = t; 1373 else low = t; 1374 } 1375 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1376 if (startj[pAkjv-start] == col) break; 1377 } 1378 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1379 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1380 #endif 1381 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1382 } 1383 }); 1384 } 1385 }); 1386 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1387 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1388 } /* endof for (i=0; i<n; i++) { */ 1389 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1390 }); 1391 Kokkos::fence(); 1392 Kokkos::deep_copy (h_flops_k, d_flops_k); 1393 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1394 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1395 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1396 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1397 /* Invert diagonal for simpler triangular solves */ 1398 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1399 int i = start + outer_index*Ni + lg_rank%Ni; 1400 if (i < end) { 1401 PetscScalar *pv = ba_d + bdiag_d[i]; 1402 *pv = 1.0/(*pv); 1403 } 1404 }); 1405 }); 1406 } 1407 PetscCall(PetscLogGpuTimeEnd()); 1408 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1409 PetscCall(ISRestoreIndices(isrow,&r_h)); 1410 1411 PetscCall(ISIdentity(isrow,&row_identity)); 1412 PetscCall(ISIdentity(isicol,&col_identity)); 1413 if (b->inode.size) { 1414 B->ops->solve = MatSolve_SeqAIJ_Inode; 1415 } else if (row_identity && col_identity) { 1416 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1417 } else { 1418 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1419 } 1420 B->offloadmask = PETSC_OFFLOAD_GPU; 1421 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1422 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1423 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1424 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1425 B->ops->matsolve = MatMatSolve_SeqAIJ; 1426 B->assembled = PETSC_TRUE; 1427 B->preallocated = PETSC_TRUE; 1428 1429 PetscFunctionReturn(0); 1430 } 1431 1432 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1433 { 1434 PetscFunctionBegin; 1435 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1436 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1437 PetscFunctionReturn(0); 1438 } 1439 1440 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1441 { 1442 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1443 1444 PetscFunctionBegin; 1445 if (!factors->sptrsv_symbolic_completed) { 1446 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1447 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1448 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1449 } 1450 PetscFunctionReturn(0); 1451 } 1452 1453 /* Check if we need to update factors etc for transpose solve */ 1454 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1455 { 1456 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1457 MatColIdxType n = A->rmap->n; 1458 1459 PetscFunctionBegin; 1460 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1461 /* Update L^T and do sptrsv symbolic */ 1462 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1463 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1464 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1465 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1466 1467 KokkosKernels::Impl::transpose_matrix< 1468 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1469 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1470 MatRowMapKokkosView,DefaultExecutionSpace>( 1471 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1472 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1473 1474 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1475 We have to sort the indices, until KK provides finer control options. 1476 */ 1477 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1478 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1479 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1480 1481 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1482 1483 /* Update U^T and do sptrsv symbolic */ 1484 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1485 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1486 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1487 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1488 1489 KokkosKernels::Impl::transpose_matrix< 1490 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1491 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1492 MatRowMapKokkosView,DefaultExecutionSpace>( 1493 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1494 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1495 1496 /* Sort indices. See comments above */ 1497 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1498 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1499 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1500 1501 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1502 factors->transpose_updated = PETSC_TRUE; 1503 } 1504 PetscFunctionReturn(0); 1505 } 1506 1507 /* Solve Ax = b, with A = LU */ 1508 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1509 { 1510 ConstPetscScalarKokkosView bv; 1511 PetscScalarKokkosView xv; 1512 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1513 1514 PetscFunctionBegin; 1515 PetscCall(PetscLogGpuTimeBegin()); 1516 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1517 PetscCall(VecGetKokkosView(b,&bv)); 1518 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1519 /* Solve L tmpv = b */ 1520 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1521 /* Solve Ux = tmpv */ 1522 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1523 PetscCall(VecRestoreKokkosView(b,&bv)); 1524 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1525 PetscCall(PetscLogGpuTimeEnd()); 1526 PetscFunctionReturn(0); 1527 } 1528 1529 /* Solve A^T x = b, where A^T = U^T L^T */ 1530 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1531 { 1532 ConstPetscScalarKokkosView bv; 1533 PetscScalarKokkosView xv; 1534 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1535 1536 PetscFunctionBegin; 1537 PetscCall(PetscLogGpuTimeBegin()); 1538 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1539 PetscCall(VecGetKokkosView(b,&bv)); 1540 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1541 /* Solve U^T tmpv = b */ 1542 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1543 1544 /* Solve L^T x = tmpv */ 1545 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1546 PetscCall(VecRestoreKokkosView(b,&bv)); 1547 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1548 PetscCall(PetscLogGpuTimeEnd()); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1553 { 1554 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1555 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1556 PetscInt fill_lev = info->levels; 1557 1558 PetscFunctionBegin; 1559 PetscCall(PetscLogGpuTimeBegin()); 1560 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1561 1562 auto a_d = aijkok->a_dual.view_device(); 1563 auto i_d = aijkok->i_dual.view_device(); 1564 auto j_d = aijkok->j_dual.view_device(); 1565 1566 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1567 1568 B->assembled = PETSC_TRUE; 1569 B->preallocated = PETSC_TRUE; 1570 B->ops->solve = MatSolve_SeqAIJKokkos; 1571 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1572 B->ops->matsolve = NULL; 1573 B->ops->matsolvetranspose = NULL; 1574 B->offloadmask = PETSC_OFFLOAD_GPU; 1575 1576 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1577 factors->transpose_updated = PETSC_FALSE; 1578 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1579 /* TODO: log flops, but how to know that? */ 1580 PetscCall(PetscLogGpuTimeEnd()); 1581 PetscFunctionReturn(0); 1582 } 1583 1584 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1585 { 1586 Mat_SeqAIJKokkos *aijkok; 1587 Mat_SeqAIJ *b; 1588 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1589 PetscInt fill_lev = info->levels; 1590 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1591 PetscInt n = A->rmap->n; 1592 1593 PetscFunctionBegin; 1594 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1595 /* Rebuild factors */ 1596 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1597 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1598 1599 /* Create a spiluk handle and then do symbolic factorization */ 1600 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1601 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1602 1603 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1604 1605 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1606 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1607 Kokkos::realloc(factors->iU_d,n+1); 1608 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1609 1610 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1611 auto i_d = aijkok->i_dual.view_device(); 1612 auto j_d = aijkok->j_dual.view_device(); 1613 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1614 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1615 1616 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1617 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1618 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1619 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1620 1621 /* TODO: add options to select sptrsv algorithms */ 1622 /* Create sptrsv handles for L, U and their transpose */ 1623 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1624 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1625 #else 1626 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1627 #endif 1628 1629 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1630 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1631 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1632 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1633 1634 /* Fill fields of the factor matrix B */ 1635 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1636 b = (Mat_SeqAIJ*)B->data; 1637 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1638 B->info.fill_ratio_given = info->fill; 1639 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1640 1641 B->offloadmask = PETSC_OFFLOAD_GPU; 1642 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1643 PetscFunctionReturn(0); 1644 } 1645 1646 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1647 { 1648 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1649 const PetscInt nrows = A->rmap->n; 1650 1651 PetscFunctionBegin; 1652 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1653 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1654 // move B data into Kokkos 1655 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1656 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1657 { 1658 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1659 if (!baijkok->diag_d.extent(0)) { 1660 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1661 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1662 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1663 } 1664 } 1665 PetscFunctionReturn(0); 1666 } 1667 1668 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1669 { 1670 PetscFunctionBegin; 1671 *type = MATSOLVERKOKKOS; 1672 PetscFunctionReturn(0); 1673 } 1674 1675 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1676 { 1677 PetscFunctionBegin; 1678 *type = MATSOLVERKOKKOSDEVICE; 1679 PetscFunctionReturn(0); 1680 } 1681 1682 /*MC 1683 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1684 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1685 1686 Level: beginner 1687 1688 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1689 M*/ 1690 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1691 { 1692 PetscInt n = A->rmap->n; 1693 1694 PetscFunctionBegin; 1695 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1696 PetscCall(MatSetSizes(*B,n,n,n,n)); 1697 (*B)->factortype = ftype; 1698 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1699 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1700 1701 if (ftype == MAT_FACTOR_LU) { 1702 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1703 (*B)->canuseordering = PETSC_TRUE; 1704 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1705 } else if (ftype == MAT_FACTOR_ILU) { 1706 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1707 (*B)->canuseordering = PETSC_FALSE; 1708 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1709 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1710 1711 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1712 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1713 PetscFunctionReturn(0); 1714 } 1715 1716 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1717 { 1718 PetscInt n = A->rmap->n; 1719 1720 PetscFunctionBegin; 1721 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1722 PetscCall(MatSetSizes(*B,n,n,n,n)); 1723 (*B)->factortype = ftype; 1724 (*B)->canuseordering = PETSC_TRUE; 1725 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1726 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1727 1728 if (ftype == MAT_FACTOR_LU) { 1729 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1730 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1731 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1732 1733 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1734 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1735 PetscFunctionReturn(0); 1736 } 1737 1738 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1739 { 1740 PetscFunctionBegin; 1741 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1742 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1743 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1744 PetscFunctionReturn(0); 1745 } 1746 1747 /* Utility to print out a KokkosCsrMatrix for debugging */ 1748 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1749 { 1750 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1751 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1752 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1753 const PetscInt *i = iv.data(); 1754 const PetscInt *j = jv.data(); 1755 const PetscScalar *a = av.data(); 1756 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1757 1758 PetscFunctionBegin; 1759 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1760 for (PetscInt k=0; k<m; k++) { 1761 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1762 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1763 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p])); 1764 } 1765 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1766 } 1767 PetscFunctionReturn(0); 1768 } 1769