1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok && aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 if (aijkok) { 105 aijkok->a_dual.sync_host(); 106 *array = aijkok->a_dual.view_host().data(); 107 } else { /* Happens when calling MatSetValues on a newly created matrix */ 108 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109 } 110 PetscFunctionReturn(0); 111 } 112 113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116 117 PetscFunctionBegin; 118 if (aijkok) aijkok->a_dual.modify_host(); 119 PetscFunctionReturn(0); 120 } 121 122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123 { 124 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125 126 PetscFunctionBegin; 127 if (aijkok) { 128 aijkok->a_dual.sync_host(); 129 *array = aijkok->a_dual.view_host().data(); 130 } else { 131 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 132 } 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137 { 138 PetscFunctionBegin; 139 *array = NULL; 140 PetscFunctionReturn(0); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok) { 149 *array = aijkok->a_dual.view_host().data(); 150 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 151 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 152 } 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok) { 162 aijkok->a_dual.clear_sync_state(); 163 aijkok->a_dual.modify_host(); 164 } 165 PetscFunctionReturn(0); 166 } 167 168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 169 { 170 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 171 172 PetscFunctionBegin; 173 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 174 175 if (i) *i = aijkok->i_device_data(); 176 if (j) *j = aijkok->j_device_data(); 177 if (a) { 178 aijkok->a_dual.sync_device(); 179 *a = aijkok->a_device_data(); 180 } 181 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 190 191 PetscFunctionBegin; 192 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 193 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 194 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 195 PetscFunctionReturn(0); 196 } 197 198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 200 { 201 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 202 203 PetscFunctionBegin; 204 if (aijkok && aijkok->device_mat_d.data()) { 205 *d_mat = aijkok->device_mat_d.data(); 206 } else { 207 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 208 *d_mat = NULL; 209 } 210 PetscFunctionReturn(0); 211 } 212 213 /* Generate the transpose on device and cache it internally */ 214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 215 { 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 217 218 PetscFunctionBegin; 219 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 222 PetscCallCXX(aijkok->a_dual.sync_device()); 223 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 224 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 225 aijkok->transpose_updated = PETSC_TRUE; 226 } 227 *csrmatT = &aijkok->csrmatT; 228 PetscFunctionReturn(0); 229 } 230 231 /* Generate the Hermitian on device and cache it internally */ 232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 233 { 234 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 235 236 PetscFunctionBegin; 237 PetscCall(PetscLogGpuTimeBegin()); 238 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 239 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 240 PetscCallCXX(aijkok->a_dual.sync_device()); 241 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 242 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 243 #if defined(PETSC_USE_COMPLEX) 244 const auto& a = aijkok->csrmatH.values; 245 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 246 #endif 247 aijkok->hermitian_updated = PETSC_TRUE; 248 } 249 *csrmatH = &aijkok->csrmatH; 250 PetscCall(PetscLogGpuTimeEnd()); 251 PetscFunctionReturn(0); 252 } 253 254 /* y = A x */ 255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 256 { 257 Mat_SeqAIJKokkos *aijkok; 258 ConstPetscScalarKokkosView xv; 259 PetscScalarKokkosView yv; 260 261 PetscFunctionBegin; 262 PetscCall(PetscLogGpuTimeBegin()); 263 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 264 PetscCall(VecGetKokkosView(xx,&xv)); 265 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 266 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 267 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 268 PetscCall(VecRestoreKokkosView(xx,&xv)); 269 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 270 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 271 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 272 PetscCall(PetscLogGpuTimeEnd()); 273 PetscFunctionReturn(0); 274 } 275 276 /* y = A^T x */ 277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 278 { 279 Mat_SeqAIJKokkos *aijkok; 280 const char *mode; 281 ConstPetscScalarKokkosView xv; 282 PetscScalarKokkosView yv; 283 KokkosCsrMatrix *csrmat; 284 285 PetscFunctionBegin; 286 PetscCall(PetscLogGpuTimeBegin()); 287 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 288 PetscCall(VecGetKokkosView(xx,&xv)); 289 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 290 if (A->form_explicit_transpose) { 291 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 292 mode = "N"; 293 } else { 294 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 295 csrmat = &aijkok->csrmat; 296 mode = "T"; 297 } 298 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 299 PetscCall(VecRestoreKokkosView(xx,&xv)); 300 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 301 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 302 PetscCall(PetscLogGpuTimeEnd()); 303 PetscFunctionReturn(0); 304 } 305 306 /* y = A^H x */ 307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 308 { 309 Mat_SeqAIJKokkos *aijkok; 310 const char *mode; 311 ConstPetscScalarKokkosView xv; 312 PetscScalarKokkosView yv; 313 KokkosCsrMatrix *csrmat; 314 315 PetscFunctionBegin; 316 PetscCall(PetscLogGpuTimeBegin()); 317 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 318 PetscCall(VecGetKokkosView(xx,&xv)); 319 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 320 if (A->form_explicit_transpose) { 321 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 322 mode = "N"; 323 } else { 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 csrmat = &aijkok->csrmat; 326 mode = "C"; 327 } 328 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 329 PetscCall(VecRestoreKokkosView(xx,&xv)); 330 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 331 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 332 PetscCall(PetscLogGpuTimeEnd()); 333 PetscFunctionReturn(0); 334 } 335 336 /* z = A x + y */ 337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 338 { 339 Mat_SeqAIJKokkos *aijkok; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 343 PetscFunctionBegin; 344 PetscCall(PetscLogGpuTimeBegin()); 345 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 346 PetscCall(VecGetKokkosView(xx,&xv)); 347 PetscCall(VecGetKokkosView(yy,&yv)); 348 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 351 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 352 PetscCall(VecRestoreKokkosView(xx,&xv)); 353 PetscCall(VecRestoreKokkosView(yy,&yv)); 354 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 355 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 356 PetscCall(PetscLogGpuTimeEnd()); 357 PetscFunctionReturn(0); 358 } 359 360 /* z = A^T x + y */ 361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 362 { 363 Mat_SeqAIJKokkos *aijkok; 364 const char *mode; 365 ConstPetscScalarKokkosView xv,yv; 366 PetscScalarKokkosView zv; 367 KokkosCsrMatrix *csrmat; 368 369 PetscFunctionBegin; 370 PetscCall(PetscLogGpuTimeBegin()); 371 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 372 PetscCall(VecGetKokkosView(xx,&xv)); 373 PetscCall(VecGetKokkosView(yy,&yv)); 374 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 375 if (zz != yy) Kokkos::deep_copy(zv,yv); 376 if (A->form_explicit_transpose) { 377 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 378 mode = "N"; 379 } else { 380 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 381 csrmat = &aijkok->csrmat; 382 mode = "T"; 383 } 384 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 385 PetscCall(VecRestoreKokkosView(xx,&xv)); 386 PetscCall(VecRestoreKokkosView(yy,&yv)); 387 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 388 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(0); 391 } 392 393 /* z = A^H x + y */ 394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv,yv; 399 PetscScalarKokkosView zv; 400 KokkosCsrMatrix *csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx,&xv)); 406 PetscCall(VecGetKokkosView(yy,&yv)); 407 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 408 if (zz != yy) Kokkos::deep_copy(zv,yv); 409 if (A->form_explicit_transpose) { 410 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 411 mode = "N"; 412 } else { 413 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 414 csrmat = &aijkok->csrmat; 415 mode = "C"; 416 } 417 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 418 PetscCall(VecRestoreKokkosView(xx,&xv)); 419 PetscCall(VecRestoreKokkosView(yy,&yv)); 420 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 421 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 422 PetscCall(PetscLogGpuTimeEnd()); 423 PetscFunctionReturn(0); 424 } 425 426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 427 { 428 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 429 430 PetscFunctionBegin; 431 switch (op) { 432 case MAT_FORM_EXPLICIT_TRANSPOSE: 433 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 434 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 435 A->form_explicit_transpose = flg; 436 break; 437 default: 438 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 439 break; 440 } 441 PetscFunctionReturn(0); 442 } 443 444 /* Depending on reuse, either build a new mat, or use the existing mat */ 445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 446 { 447 Mat_SeqAIJ *aseq; 448 449 PetscFunctionBegin; 450 PetscCall(PetscKokkosInitializeCheck()); 451 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 452 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 453 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 454 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 455 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 456 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 457 PetscCall(PetscFree(A->defaultvectype)); 458 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 459 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 460 PetscCall(MatSetOps_SeqAIJKokkos(A)); 461 aseq = static_cast<Mat_SeqAIJ*>(A->data); 462 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 463 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 464 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 465 } 466 } 467 PetscFunctionReturn(0); 468 } 469 470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 471 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 472 */ 473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 474 { 475 Mat_SeqAIJ *bseq; 476 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 477 Mat mat; 478 479 PetscFunctionBegin; 480 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 481 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 482 mat = *B; 483 if (A->assembled) { 484 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 485 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 486 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 487 /* Now copy values to B if needed */ 488 if (dupOption == MAT_COPY_VALUES) { 489 if (akok->a_dual.need_sync_device()) { 490 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 491 bkok->a_dual.modify_host(); 492 } else { /* If device has the latest data, we only copy data on device */ 493 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 494 bkok->a_dual.modify_device(); 495 } 496 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 497 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 498 bkok->a_dual.modify_host(); 499 } 500 mat->spptr = bkok; 501 } 502 503 PetscCall(PetscFree(mat->defaultvectype)); 504 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 505 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 506 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 507 PetscFunctionReturn(0); 508 } 509 510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 511 { 512 Mat At; 513 KokkosCsrMatrix *internT; 514 Mat_SeqAIJKokkos *atkok,*bkok; 515 516 PetscFunctionBegin; 517 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 518 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 519 /* Deep copy internT, as we want to isolate the internal transpose */ 520 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 521 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 522 if (reuse == MAT_INITIAL_MATRIX) *B = At; 523 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 524 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 525 if ((*B)->assembled) { 526 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 527 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 528 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 529 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 530 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 531 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 532 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 533 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 534 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 535 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 536 } 537 PetscFunctionReturn(0); 538 } 539 540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 541 { 542 Mat_SeqAIJKokkos *aijkok; 543 544 PetscFunctionBegin; 545 if (A->factortype == MAT_FACTOR_NONE) { 546 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 547 delete aijkok; 548 } else { 549 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 550 } 551 A->spptr = NULL; 552 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 553 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 554 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 555 PetscCall(MatDestroy_SeqAIJ(A)); 556 PetscFunctionReturn(0); 557 } 558 559 /*MC 560 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 561 562 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 563 564 Options Database Keys: 565 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 566 567 Level: beginner 568 569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 570 M*/ 571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 572 { 573 PetscFunctionBegin; 574 PetscCall(PetscKokkosInitializeCheck()); 575 PetscCall(MatCreate_SeqAIJ(A)); 576 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 577 PetscFunctionReturn(0); 578 } 579 580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582 { 583 Mat_SeqAIJ *a,*b; 584 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 585 MatScalarKokkosView aa,ba,ca; 586 MatRowMapKokkosView ai,bi,ci; 587 MatColIdxKokkosView aj,bj,cj; 588 PetscInt m,n,nnz,aN; 589 590 PetscFunctionBegin; 591 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 592 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 593 PetscValidPointer(C,4); 594 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 595 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 596 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 597 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 598 599 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 600 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 601 a = static_cast<Mat_SeqAIJ*>(A->data); 602 b = static_cast<Mat_SeqAIJ*>(B->data); 603 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 604 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 605 aa = akok->a_dual.view_device(); 606 ai = akok->i_dual.view_device(); 607 ba = bkok->a_dual.view_device(); 608 bi = bkok->i_dual.view_device(); 609 m = A->rmap->n; /* M, N and nnz of C */ 610 n = A->cmap->n + B->cmap->n; 611 nnz = a->nz + b->nz; 612 aN = A->cmap->n; /* N of A */ 613 if (reuse == MAT_INITIAL_MATRIX) { 614 aj = akok->j_dual.view_device(); 615 bj = bkok->j_dual.view_device(); 616 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 617 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 618 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 619 ca = ca_dual.view_device(); 620 ci = ci_dual.view_device(); 621 cj = cj_dual.view_device(); 622 623 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 624 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 625 PetscInt i = t.league_rank(); /* row i */ 626 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 627 628 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 629 ci(i) = coffset; 630 if (i == m-1) ci(m) = ai(m) + bi(m); 631 }); 632 633 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634 if (k < alen) { 635 ca(coffset+k) = aa(ai(i)+k); 636 cj(coffset+k) = aj(ai(i)+k); 637 } else { 638 ca(coffset+k) = ba(bi(i)+k-alen); 639 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 640 } 641 }); 642 }); 643 ca_dual.modify_device(); 644 ci_dual.modify_device(); 645 cj_dual.modify_device(); 646 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 647 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 648 } else if (reuse == MAT_REUSE_MATRIX) { 649 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 650 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 651 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 652 ca = ckok->a_dual.view_device(); 653 ci = ckok->i_dual.view_device(); 654 655 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 656 PetscInt i = t.league_rank(); /* row i */ 657 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 658 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 659 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 660 else ca(ci(i)+k) = ba(bi(i)+k-alen); 661 }); 662 }); 663 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 664 } 665 PetscFunctionReturn(0); 666 } 667 668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 669 { 670 PetscFunctionBegin; 671 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 672 PetscFunctionReturn(0); 673 } 674 675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 676 { 677 Mat_Product *product = C->product; 678 Mat A,B; 679 bool transA,transB; /* use bool, since KK needs this type */ 680 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 681 Mat_SeqAIJ *c; 682 MatProductData_SeqAIJKokkos *pdata; 683 KokkosCsrMatrix *csrmatA,*csrmatB; 684 685 PetscFunctionBegin; 686 MatCheckProduct(C,1); 687 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 688 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 689 690 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 691 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 692 PetscFunctionReturn(0); 693 } 694 695 switch (product->type) { 696 case MATPRODUCT_AB: transA = false; transB = false; break; 697 case MATPRODUCT_AtB: transA = true; transB = false; break; 698 case MATPRODUCT_ABt: transA = false; transB = true; break; 699 default: 700 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 701 } 702 703 A = product->A; 704 B = product->B; 705 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 706 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 707 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 709 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 710 711 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 712 713 csrmatA = &akok->csrmat; 714 csrmatB = &bkok->csrmat; 715 716 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 717 if (transA) { 718 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 719 transA = false; 720 } 721 722 if (transB) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 724 transB = false; 725 } 726 PetscCall(PetscLogGpuTimeBegin()); 727 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 728 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 729 PetscCall(PetscLogGpuTimeEnd()); 730 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 731 /* shorter version of MatAssemblyEnd_SeqAIJ */ 732 c = (Mat_SeqAIJ*)C->data; 733 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 734 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 735 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 736 c->reallocs = 0; 737 C->info.mallocs = 0; 738 C->info.nz_unneeded = 0; 739 C->assembled = C->was_assembled = PETSC_TRUE; 740 C->num_ass++; 741 PetscFunctionReturn(0); 742 } 743 744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 745 { 746 Mat_Product *product = C->product; 747 MatProductType ptype; 748 Mat A,B; 749 bool transA,transB; 750 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 751 MatProductData_SeqAIJKokkos *pdata; 752 MPI_Comm comm; 753 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 754 755 PetscFunctionBegin; 756 MatCheckProduct(C,1); 757 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 758 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 759 A = product->A; 760 B = product->B; 761 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 762 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 763 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 764 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 765 csrmatA = &akok->csrmat; 766 csrmatB = &bkok->csrmat; 767 768 ptype = product->type; 769 switch (ptype) { 770 case MATPRODUCT_AB: transA = false; transB = false; break; 771 case MATPRODUCT_AtB: transA = true; transB = false; break; 772 case MATPRODUCT_ABt: transA = false; transB = true; break; 773 default: 774 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 775 } 776 777 product->data = pdata = new MatProductData_SeqAIJKokkos(); 778 pdata->kh.set_team_work_size(16); 779 pdata->kh.set_dynamic_scheduling(true); 780 pdata->reusesym = product->api_user; 781 782 /* TODO: add command line options to select spgemm algorithms */ 783 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 784 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 785 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 786 */ 787 pdata->kh.create_spgemm_handle(spgemm_alg); 788 789 PetscCall(PetscLogGpuTimeBegin()); 790 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 791 if (transA) { 792 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 793 transA = false; 794 } 795 796 if (transB) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 798 transB = false; 799 } 800 801 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 802 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 803 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 804 calling new Mat_SeqAIJKokkos(). 805 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 806 */ 807 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 808 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 809 PetscCall(PetscLogGpuTimeEnd()); 810 811 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 812 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 813 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 814 PetscFunctionReturn(0); 815 } 816 817 /* handles sparse matrix matrix ops */ 818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 819 { 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 826 if (product->type == MATPRODUCT_ABC) { 827 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 Mat_SeqAIJKokkos *aijkok; 853 854 PetscFunctionBegin; 855 PetscCall(PetscLogGpuTimeBegin()); 856 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 857 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 858 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 860 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 861 PetscCall(PetscLogGpuTimeEnd()); 862 PetscFunctionReturn(0); 863 } 864 865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 866 { 867 Mat_SeqAIJKokkos *aijkok; 868 869 PetscFunctionBegin; 870 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 871 if (aijkok) { /* Only zero the device if data is already there */ 872 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 873 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 874 } else { /* Might be preallocated but not assembled */ 875 PetscCall(MatZeroEntries_SeqAIJ(A)); 876 } 877 PetscFunctionReturn(0); 878 } 879 880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 882 { 883 Mat_SeqAIJKokkos *aijkok; 884 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 890 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891 *kv = aijkok->a_dual.view_device(); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 896 { 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 PetscFunctionReturn(0); 902 } 903 904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 905 { 906 Mat_SeqAIJKokkos *aijkok; 907 908 PetscFunctionBegin; 909 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 910 PetscValidPointer(kv,2); 911 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 912 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 913 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 914 *kv = aijkok->a_dual.view_device(); 915 PetscFunctionReturn(0); 916 } 917 918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 919 { 920 PetscFunctionBegin; 921 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 922 PetscValidPointer(kv,2); 923 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 924 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 925 PetscFunctionReturn(0); 926 } 927 928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 929 { 930 Mat_SeqAIJKokkos *aijkok; 931 932 PetscFunctionBegin; 933 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 934 PetscValidPointer(kv,2); 935 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 936 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 937 *kv = aijkok->a_dual.view_device(); 938 PetscFunctionReturn(0); 939 } 940 941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 942 { 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 948 PetscFunctionReturn(0); 949 } 950 951 /* Computes Y += alpha X */ 952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953 { 954 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 955 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 956 ConstMatScalarKokkosView Xa; 957 MatScalarKokkosView Ya; 958 959 PetscFunctionBegin; 960 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 961 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 962 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 963 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 964 PetscCall(PetscLogGpuTimeBegin()); 965 966 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 967 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 968 PetscBool e; 969 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 970 if (e) { 971 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 972 if (e) pattern = SAME_NONZERO_PATTERN; 973 } 974 } 975 976 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 977 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 978 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 979 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 980 */ 981 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 982 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 983 Xa = xkok->a_dual.view_device(); 984 Ya = ykok->a_dual.view_device(); 985 986 if (pattern == SAME_NONZERO_PATTERN) { 987 KokkosBlas::axpy(alpha,Xa,Ya); 988 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 989 } else if (pattern == SUBSET_NONZERO_PATTERN) { 990 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 991 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 992 993 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 994 PetscInt i = t.league_rank(); /* row i */ 995 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 996 PetscInt p,q = Yi(i); 997 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 998 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 999 if (Xj(p) == Yj(q)) { /* Found it */ 1000 Ya(q) += alpha * Xa(p); 1001 q++; 1002 } else { 1003 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1004 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1005 */ 1006 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1007 } 1008 } 1009 }); 1010 }); 1011 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1012 } else { /* different nonzero patterns */ 1013 Mat Z; 1014 KokkosCsrMatrix zcsr; 1015 KernelHandle kh; 1016 kh.create_spadd_handle(false); 1017 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1018 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1019 zkok = new Mat_SeqAIJKokkos(zcsr); 1020 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1021 PetscCall(MatHeaderReplace(Y,&Z)); 1022 kh.destroy_spadd_handle(); 1023 } 1024 PetscCall(PetscLogGpuTimeEnd()); 1025 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1026 PetscFunctionReturn(0); 1027 } 1028 1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1030 { 1031 Mat_SeqAIJKokkos *akok; 1032 Mat_SeqAIJ *aseq; 1033 1034 PetscFunctionBegin; 1035 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1036 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1037 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1038 delete akok; 1039 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1040 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1041 akok->SetUpCOO(aseq); 1042 PetscFunctionReturn(0); 1043 } 1044 1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1046 { 1047 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1048 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1049 PetscCount Annz = aseq->nz; 1050 const PetscCountKokkosView& jmap = akok->jmap_d; 1051 const PetscCountKokkosView& perm = akok->perm_d; 1052 MatScalarKokkosView Aa; 1053 ConstMatScalarKokkosView kv; 1054 PetscMemType memtype; 1055 1056 PetscFunctionBegin; 1057 PetscCall(PetscGetMemType(v,&memtype)); 1058 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1059 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1060 } else { 1061 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1062 } 1063 1064 if (imode == INSERT_VALUES) { 1065 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1066 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1067 } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1068 1069 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1070 1071 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1072 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1077 { 1078 PetscFunctionBegin; 1079 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1080 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1081 B->offloadmask = PETSC_OFFLOAD_CPU; 1082 PetscFunctionReturn(0); 1083 } 1084 1085 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1086 { 1087 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1088 1089 PetscFunctionBegin; 1090 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1091 A->boundtocpu = PETSC_FALSE; 1092 1093 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1094 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1095 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1096 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1097 A->ops->scale = MatScale_SeqAIJKokkos; 1098 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1099 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1100 A->ops->mult = MatMult_SeqAIJKokkos; 1101 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1102 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1103 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1104 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1105 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1106 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1107 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1108 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1109 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1110 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1111 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1112 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1113 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1114 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1115 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1116 1117 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1118 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1119 PetscFunctionReturn(0); 1120 } 1121 1122 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1123 { 1124 Mat_SeqAIJ *aseq; 1125 PetscInt i,m,n; 1126 1127 PetscFunctionBegin; 1128 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1129 1130 m = akok->nrows(); 1131 n = akok->ncols(); 1132 PetscCall(MatSetSizes(A,m,n,m,n)); 1133 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1134 1135 /* Set up data structures of A as a MATSEQAIJ */ 1136 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1137 aseq = (Mat_SeqAIJ*)(A)->data; 1138 1139 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1140 akok->j_dual.sync_host(); 1141 1142 aseq->i = akok->i_host_data(); 1143 aseq->j = akok->j_host_data(); 1144 aseq->a = akok->a_host_data(); 1145 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1146 aseq->singlemalloc = PETSC_FALSE; 1147 aseq->free_a = PETSC_FALSE; 1148 aseq->free_ij = PETSC_FALSE; 1149 aseq->nz = akok->nnz(); 1150 aseq->maxnz = aseq->nz; 1151 1152 PetscCall(PetscMalloc1(m,&aseq->imax)); 1153 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1154 for (i=0; i<m; i++) { 1155 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1156 } 1157 1158 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1159 akok->nonzerostate = A->nonzerostate; 1160 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1161 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1162 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1167 1168 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1169 */ 1170 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1171 { 1172 PetscFunctionBegin; 1173 PetscCall(MatCreate(comm,A)); 1174 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1175 PetscFunctionReturn(0); 1176 } 1177 1178 /* --------------------------------------------------------------------------------*/ 1179 /*@C 1180 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1181 (the default parallel PETSc format). This matrix will ultimately be handled by 1182 Kokkos for calculations. For good matrix 1183 assembly performance the user should preallocate the matrix storage by setting 1184 the parameter nz (or the array nnz). By setting these parameters accurately, 1185 performance during matrix assembly can be increased by more than a factor of 50. 1186 1187 Collective 1188 1189 Input Parameters: 1190 + comm - MPI communicator, set to PETSC_COMM_SELF 1191 . m - number of rows 1192 . n - number of columns 1193 . nz - number of nonzeros per row (same for all rows) 1194 - nnz - array containing the number of nonzeros in the various rows 1195 (possibly different for each row) or NULL 1196 1197 Output Parameter: 1198 . A - the matrix 1199 1200 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1201 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1202 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1203 1204 Notes: 1205 If nnz is given then nz is ignored 1206 1207 The AIJ format (also called the Yale sparse matrix format or 1208 compressed row storage), is fully compatible with standard Fortran 77 1209 storage. That is, the stored row and column indices can begin at 1210 either one (as in Fortran) or zero. See the users' manual for details. 1211 1212 Specify the preallocated storage with either nz or nnz (not both). 1213 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1214 allocation. For large problems you MUST preallocate memory or you 1215 will get TERRIBLE performance, see the users' manual chapter on matrices. 1216 1217 By default, this format uses inodes (identical nodes) when possible, to 1218 improve numerical efficiency of matrix-vector products and solves. We 1219 search for consecutive rows with the same nonzero structure, thereby 1220 reusing matrix information to achieve increased efficiency. 1221 1222 Level: intermediate 1223 1224 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1225 @*/ 1226 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1227 { 1228 PetscFunctionBegin; 1229 PetscCall(PetscKokkosInitializeCheck()); 1230 PetscCall(MatCreate(comm,A)); 1231 PetscCall(MatSetSizes(*A,m,n,m,n)); 1232 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1233 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1234 PetscFunctionReturn(0); 1235 } 1236 1237 typedef Kokkos::TeamPolicy<>::member_type team_member; 1238 // 1239 // This factorization exploits block diagonal matrices with "Nf" (not used). 1240 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1241 // 1242 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1243 { 1244 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1245 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1246 IS isrow = b->row,isicol = b->icol; 1247 const PetscInt *r_h,*ic_h; 1248 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1249 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1250 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1251 PetscBool row_identity,col_identity; 1252 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1253 1254 PetscFunctionBegin; 1255 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1256 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1257 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1258 PetscCall(ISGetIndices(isrow,&r_h)); 1259 PetscCall(ISGetIndices(isicol,&ic_h)); 1260 PetscCall(ISGetSize(isicol,&nc)); 1261 PetscCall(PetscLogGpuTimeBegin()); 1262 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1263 { 1264 #define KOKKOS_SHARED_LEVEL 1 1265 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1266 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1267 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1268 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1269 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1270 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1271 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1272 size_t flops_h = 0.0; 1273 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1274 Kokkos::View<size_t> d_flops_k ("flops"); 1275 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1276 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1277 Kokkos::deep_copy (d_flops_k, h_flops_k); 1278 Kokkos::deep_copy (d_r_k, h_r_k); 1279 Kokkos::deep_copy (d_ic_k, h_ic_k); 1280 // Fill A --> fact 1281 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1282 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1283 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1284 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1285 // zero rows of B 1286 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1287 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1288 PetscScalar *baL = ba_d + bi_d[rowb]; 1289 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1290 /* zero (unfactored row) */ 1291 for (int j=0;j<nzbL;j++) baL[j] = 0; 1292 for (int j=0;j<nzbU;j++) baU[j] = 0; 1293 }); 1294 // copy A into B 1295 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1296 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1297 const PetscScalar *av = aa_d + ai_d[rowa]; 1298 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1299 /* load in initial (unfactored row) */ 1300 for (int j=0;j<nza;j++) { 1301 PetscInt colb = ic[ajtmp[j]]; 1302 PetscScalar vala = av[j]; 1303 if (colb == rowb) { 1304 *(ba_d + bdiag_d[rowb]) = vala; 1305 } else { 1306 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1307 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1308 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1309 for (int j=0; j<nz ; j++) { 1310 if (pbj[j] == colb) { 1311 pba[j] = vala; 1312 set++; 1313 break; 1314 } 1315 } 1316 #if !defined(PETSC_HAVE_SYCL) 1317 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1318 #endif 1319 } 1320 } 1321 }); 1322 }); 1323 Kokkos::fence(); 1324 1325 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1326 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1327 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1328 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1329 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1330 const PetscInt start = field*nloc, end = start + nloc; 1331 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1332 // A22 panel update for each row A(1,:) and col A(:,1) 1333 for (int ii=start; ii<end-1; ii++) { 1334 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1335 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1336 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1337 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1338 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1339 PetscInt kIdx = j*Ni + field_block_idx; 1340 if (kIdx >= nzUi) /* void */ ; 1341 else { 1342 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1343 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1344 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1345 size_t st_idx; 1346 // find and do L(k,i) = A(:k,i) / A(i,i) 1347 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1348 // get column, there has got to be a better way 1349 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1350 if (pjL[j] == ii) { 1351 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1352 idx = j; // output 1353 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1354 } 1355 }, st_idx); 1356 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1357 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1358 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1359 #endif 1360 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1361 // U(i+1,:end) 1362 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1363 PetscScalar Uij = baUi[uiIdx]; 1364 PetscInt col = bjUi[uiIdx]; 1365 if (col==myk) { 1366 // A_kk = A_kk - L_ki * U_ij(k) 1367 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1368 *Akkv = *Akkv - L_ki() * Uij; // UiK 1369 } else { 1370 PetscScalar *start, *end, *pAkjv=NULL; 1371 PetscInt high, low; 1372 const PetscInt *startj; 1373 if (col<myk) { // L 1374 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1375 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1376 start = pLki+1; // start at pLki+1, A22(myk,1) 1377 startj= bj_d + bi_d[myk] + idx; 1378 end = ba_d + bi_d[myk+1]; 1379 } else { 1380 PetscInt idx = bdiag_d[myk+1]+1; 1381 start = ba_d + idx; 1382 startj= bj_d + idx; 1383 end = ba_d + bdiag_d[myk]; 1384 } 1385 // search for 'col', use bisection search - TODO 1386 low = 0; 1387 high = (PetscInt)(end-start); 1388 while (high-low > 5) { 1389 int t = (low+high)/2; 1390 if (startj[t] > col) high = t; 1391 else low = t; 1392 } 1393 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1394 if (startj[pAkjv-start] == col) break; 1395 } 1396 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1397 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1398 #endif 1399 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1400 } 1401 }); 1402 } 1403 }); 1404 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1405 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1406 } /* endof for (i=0; i<n; i++) { */ 1407 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1408 }); 1409 Kokkos::fence(); 1410 Kokkos::deep_copy (h_flops_k, d_flops_k); 1411 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1412 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1413 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1414 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1415 /* Invert diagonal for simpler triangular solves */ 1416 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1417 int i = start + outer_index*Ni + lg_rank%Ni; 1418 if (i < end) { 1419 PetscScalar *pv = ba_d + bdiag_d[i]; 1420 *pv = 1.0/(*pv); 1421 } 1422 }); 1423 }); 1424 } 1425 PetscCall(PetscLogGpuTimeEnd()); 1426 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1427 PetscCall(ISRestoreIndices(isrow,&r_h)); 1428 1429 PetscCall(ISIdentity(isrow,&row_identity)); 1430 PetscCall(ISIdentity(isicol,&col_identity)); 1431 if (b->inode.size) { 1432 B->ops->solve = MatSolve_SeqAIJ_Inode; 1433 } else if (row_identity && col_identity) { 1434 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1435 } else { 1436 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1437 } 1438 B->offloadmask = PETSC_OFFLOAD_GPU; 1439 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1440 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1441 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1442 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1443 B->ops->matsolve = MatMatSolve_SeqAIJ; 1444 B->assembled = PETSC_TRUE; 1445 B->preallocated = PETSC_TRUE; 1446 1447 PetscFunctionReturn(0); 1448 } 1449 1450 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1451 { 1452 PetscFunctionBegin; 1453 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1454 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1455 PetscFunctionReturn(0); 1456 } 1457 1458 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1459 { 1460 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1461 1462 PetscFunctionBegin; 1463 if (!factors->sptrsv_symbolic_completed) { 1464 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1465 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1466 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1467 } 1468 PetscFunctionReturn(0); 1469 } 1470 1471 /* Check if we need to update factors etc for transpose solve */ 1472 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1473 { 1474 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1475 MatColIdxType n = A->rmap->n; 1476 1477 PetscFunctionBegin; 1478 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1479 /* Update L^T and do sptrsv symbolic */ 1480 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1481 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1482 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1483 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1484 1485 KokkosKernels::Impl::transpose_matrix< 1486 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1487 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1488 MatRowMapKokkosView,DefaultExecutionSpace>( 1489 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1490 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1491 1492 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1493 We have to sort the indices, until KK provides finer control options. 1494 */ 1495 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1496 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1497 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1498 1499 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1500 1501 /* Update U^T and do sptrsv symbolic */ 1502 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1503 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1504 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1505 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1506 1507 KokkosKernels::Impl::transpose_matrix< 1508 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1509 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1510 MatRowMapKokkosView,DefaultExecutionSpace>( 1511 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1512 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1513 1514 /* Sort indices. See comments above */ 1515 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1516 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1517 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1518 1519 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1520 factors->transpose_updated = PETSC_TRUE; 1521 } 1522 PetscFunctionReturn(0); 1523 } 1524 1525 /* Solve Ax = b, with A = LU */ 1526 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1527 { 1528 ConstPetscScalarKokkosView bv; 1529 PetscScalarKokkosView xv; 1530 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1531 1532 PetscFunctionBegin; 1533 PetscCall(PetscLogGpuTimeBegin()); 1534 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1535 PetscCall(VecGetKokkosView(b,&bv)); 1536 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1537 /* Solve L tmpv = b */ 1538 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1539 /* Solve Ux = tmpv */ 1540 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1541 PetscCall(VecRestoreKokkosView(b,&bv)); 1542 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1543 PetscCall(PetscLogGpuTimeEnd()); 1544 PetscFunctionReturn(0); 1545 } 1546 1547 /* Solve A^T x = b, where A^T = U^T L^T */ 1548 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1549 { 1550 ConstPetscScalarKokkosView bv; 1551 PetscScalarKokkosView xv; 1552 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1553 1554 PetscFunctionBegin; 1555 PetscCall(PetscLogGpuTimeBegin()); 1556 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1557 PetscCall(VecGetKokkosView(b,&bv)); 1558 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1559 /* Solve U^T tmpv = b */ 1560 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1561 1562 /* Solve L^T x = tmpv */ 1563 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1564 PetscCall(VecRestoreKokkosView(b,&bv)); 1565 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1566 PetscCall(PetscLogGpuTimeEnd()); 1567 PetscFunctionReturn(0); 1568 } 1569 1570 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1571 { 1572 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1573 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1574 PetscInt fill_lev = info->levels; 1575 1576 PetscFunctionBegin; 1577 PetscCall(PetscLogGpuTimeBegin()); 1578 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1579 1580 auto a_d = aijkok->a_dual.view_device(); 1581 auto i_d = aijkok->i_dual.view_device(); 1582 auto j_d = aijkok->j_dual.view_device(); 1583 1584 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1585 1586 B->assembled = PETSC_TRUE; 1587 B->preallocated = PETSC_TRUE; 1588 B->ops->solve = MatSolve_SeqAIJKokkos; 1589 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1590 B->ops->matsolve = NULL; 1591 B->ops->matsolvetranspose = NULL; 1592 B->offloadmask = PETSC_OFFLOAD_GPU; 1593 1594 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1595 factors->transpose_updated = PETSC_FALSE; 1596 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1597 /* TODO: log flops, but how to know that? */ 1598 PetscCall(PetscLogGpuTimeEnd()); 1599 PetscFunctionReturn(0); 1600 } 1601 1602 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1603 { 1604 Mat_SeqAIJKokkos *aijkok; 1605 Mat_SeqAIJ *b; 1606 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1607 PetscInt fill_lev = info->levels; 1608 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1609 PetscInt n = A->rmap->n; 1610 1611 PetscFunctionBegin; 1612 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1613 /* Rebuild factors */ 1614 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1615 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1616 1617 /* Create a spiluk handle and then do symbolic factorization */ 1618 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1619 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1620 1621 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1622 1623 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1624 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1625 Kokkos::realloc(factors->iU_d,n+1); 1626 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1627 1628 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1629 auto i_d = aijkok->i_dual.view_device(); 1630 auto j_d = aijkok->j_dual.view_device(); 1631 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1632 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1633 1634 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1635 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1636 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1637 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1638 1639 /* TODO: add options to select sptrsv algorithms */ 1640 /* Create sptrsv handles for L, U and their transpose */ 1641 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1642 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1643 #else 1644 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1645 #endif 1646 1647 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1648 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1649 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1650 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1651 1652 /* Fill fields of the factor matrix B */ 1653 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1654 b = (Mat_SeqAIJ*)B->data; 1655 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1656 B->info.fill_ratio_given = info->fill; 1657 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1658 1659 B->offloadmask = PETSC_OFFLOAD_GPU; 1660 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1661 PetscFunctionReturn(0); 1662 } 1663 1664 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1665 { 1666 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1667 const PetscInt nrows = A->rmap->n; 1668 1669 PetscFunctionBegin; 1670 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1671 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1672 // move B data into Kokkos 1673 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1674 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1675 { 1676 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1677 if (!baijkok->diag_d.extent(0)) { 1678 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1679 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1680 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1681 } 1682 } 1683 PetscFunctionReturn(0); 1684 } 1685 1686 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1687 { 1688 PetscFunctionBegin; 1689 *type = MATSOLVERKOKKOS; 1690 PetscFunctionReturn(0); 1691 } 1692 1693 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1694 { 1695 PetscFunctionBegin; 1696 *type = MATSOLVERKOKKOSDEVICE; 1697 PetscFunctionReturn(0); 1698 } 1699 1700 /*MC 1701 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1702 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1703 1704 Level: beginner 1705 1706 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1707 M*/ 1708 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1709 { 1710 PetscInt n = A->rmap->n; 1711 1712 PetscFunctionBegin; 1713 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1714 PetscCall(MatSetSizes(*B,n,n,n,n)); 1715 (*B)->factortype = ftype; 1716 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1717 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1718 1719 if (ftype == MAT_FACTOR_LU) { 1720 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1721 (*B)->canuseordering = PETSC_TRUE; 1722 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1723 } else if (ftype == MAT_FACTOR_ILU) { 1724 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1725 (*B)->canuseordering = PETSC_FALSE; 1726 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1727 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1728 1729 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1730 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1731 PetscFunctionReturn(0); 1732 } 1733 1734 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1735 { 1736 PetscInt n = A->rmap->n; 1737 1738 PetscFunctionBegin; 1739 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1740 PetscCall(MatSetSizes(*B,n,n,n,n)); 1741 (*B)->factortype = ftype; 1742 (*B)->canuseordering = PETSC_TRUE; 1743 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1744 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1745 1746 if (ftype == MAT_FACTOR_LU) { 1747 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1748 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1749 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1750 1751 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1752 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1753 PetscFunctionReturn(0); 1754 } 1755 1756 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1757 { 1758 PetscFunctionBegin; 1759 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1760 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1761 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 /* Utility to print out a KokkosCsrMatrix for debugging */ 1766 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1767 { 1768 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1769 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1770 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1771 const PetscInt *i = iv.data(); 1772 const PetscInt *j = jv.data(); 1773 const PetscScalar *a = av.data(); 1774 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1775 1776 PetscFunctionBegin; 1777 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1778 for (PetscInt k=0; k<m; k++) { 1779 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1780 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1781 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p])); 1782 } 1783 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1784 } 1785 PetscFunctionReturn(0); 1786 } 1787