1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok && aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 if (aijkok) { 105 aijkok->a_dual.sync_host(); 106 *array = aijkok->a_dual.view_host().data(); 107 } else { /* Happens when calling MatSetValues on a newly created matrix */ 108 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109 } 110 PetscFunctionReturn(0); 111 } 112 113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116 117 PetscFunctionBegin; 118 if (aijkok) aijkok->a_dual.modify_host(); 119 PetscFunctionReturn(0); 120 } 121 122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123 { 124 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125 126 PetscFunctionBegin; 127 if (aijkok) { 128 aijkok->a_dual.sync_host(); 129 *array = aijkok->a_dual.view_host().data(); 130 } else { 131 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 132 } 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137 { 138 PetscFunctionBegin; 139 *array = NULL; 140 PetscFunctionReturn(0); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok) { 149 *array = aijkok->a_dual.view_host().data(); 150 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 151 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 152 } 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok) { 162 aijkok->a_dual.clear_sync_state(); 163 aijkok->a_dual.modify_host(); 164 } 165 PetscFunctionReturn(0); 166 } 167 168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 169 { 170 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 171 172 PetscFunctionBegin; 173 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 174 175 if (i) *i = aijkok->i_device_data(); 176 if (j) *j = aijkok->j_device_data(); 177 if (a) { 178 aijkok->a_dual.sync_device(); 179 *a = aijkok->a_device_data(); 180 } 181 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 190 191 PetscFunctionBegin; 192 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 193 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 194 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 195 PetscFunctionReturn(0); 196 } 197 198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 200 { 201 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 202 203 PetscFunctionBegin; 204 if (aijkok && aijkok->device_mat_d.data()) { 205 *d_mat = aijkok->device_mat_d.data(); 206 } else { 207 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 208 *d_mat = NULL; 209 } 210 PetscFunctionReturn(0); 211 } 212 213 /* Generate the transpose on device and cache it internally */ 214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 215 { 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 217 218 PetscFunctionBegin; 219 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 222 PetscCallCXX(aijkok->a_dual.sync_device()); 223 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 224 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 225 aijkok->transpose_updated = PETSC_TRUE; 226 } 227 *csrmatT = &aijkok->csrmatT; 228 PetscFunctionReturn(0); 229 } 230 231 /* Generate the Hermitian on device and cache it internally */ 232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 233 { 234 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 235 236 PetscFunctionBegin; 237 PetscCall(PetscLogGpuTimeBegin()); 238 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 239 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 240 PetscCallCXX(aijkok->a_dual.sync_device()); 241 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 242 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 243 #if defined(PETSC_USE_COMPLEX) 244 const auto& a = aijkok->csrmatH.values; 245 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 246 #endif 247 aijkok->hermitian_updated = PETSC_TRUE; 248 } 249 *csrmatH = &aijkok->csrmatH; 250 PetscCall(PetscLogGpuTimeEnd()); 251 PetscFunctionReturn(0); 252 } 253 254 /* y = A x */ 255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 256 { 257 Mat_SeqAIJKokkos *aijkok; 258 ConstPetscScalarKokkosView xv; 259 PetscScalarKokkosView yv; 260 261 PetscFunctionBegin; 262 PetscCall(PetscLogGpuTimeBegin()); 263 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 264 PetscCall(VecGetKokkosView(xx,&xv)); 265 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 266 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 267 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 268 PetscCall(VecRestoreKokkosView(xx,&xv)); 269 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 270 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 271 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 272 PetscCall(PetscLogGpuTimeEnd()); 273 PetscFunctionReturn(0); 274 } 275 276 /* y = A^T x */ 277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 278 { 279 Mat_SeqAIJKokkos *aijkok; 280 const char *mode; 281 ConstPetscScalarKokkosView xv; 282 PetscScalarKokkosView yv; 283 KokkosCsrMatrix *csrmat; 284 285 PetscFunctionBegin; 286 PetscCall(PetscLogGpuTimeBegin()); 287 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 288 PetscCall(VecGetKokkosView(xx,&xv)); 289 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 290 if (A->form_explicit_transpose) { 291 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 292 mode = "N"; 293 } else { 294 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 295 csrmat = &aijkok->csrmat; 296 mode = "T"; 297 } 298 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 299 PetscCall(VecRestoreKokkosView(xx,&xv)); 300 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 301 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 302 PetscCall(PetscLogGpuTimeEnd()); 303 PetscFunctionReturn(0); 304 } 305 306 /* y = A^H x */ 307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 308 { 309 Mat_SeqAIJKokkos *aijkok; 310 const char *mode; 311 ConstPetscScalarKokkosView xv; 312 PetscScalarKokkosView yv; 313 KokkosCsrMatrix *csrmat; 314 315 PetscFunctionBegin; 316 PetscCall(PetscLogGpuTimeBegin()); 317 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 318 PetscCall(VecGetKokkosView(xx,&xv)); 319 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 320 if (A->form_explicit_transpose) { 321 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 322 mode = "N"; 323 } else { 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 csrmat = &aijkok->csrmat; 326 mode = "C"; 327 } 328 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 329 PetscCall(VecRestoreKokkosView(xx,&xv)); 330 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 331 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 332 PetscCall(PetscLogGpuTimeEnd()); 333 PetscFunctionReturn(0); 334 } 335 336 /* z = A x + y */ 337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 338 { 339 Mat_SeqAIJKokkos *aijkok; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 343 PetscFunctionBegin; 344 PetscCall(PetscLogGpuTimeBegin()); 345 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 346 PetscCall(VecGetKokkosView(xx,&xv)); 347 PetscCall(VecGetKokkosView(yy,&yv)); 348 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 351 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 352 PetscCall(VecRestoreKokkosView(xx,&xv)); 353 PetscCall(VecRestoreKokkosView(yy,&yv)); 354 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 355 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 356 PetscCall(PetscLogGpuTimeEnd()); 357 PetscFunctionReturn(0); 358 } 359 360 /* z = A^T x + y */ 361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 362 { 363 Mat_SeqAIJKokkos *aijkok; 364 const char *mode; 365 ConstPetscScalarKokkosView xv,yv; 366 PetscScalarKokkosView zv; 367 KokkosCsrMatrix *csrmat; 368 369 PetscFunctionBegin; 370 PetscCall(PetscLogGpuTimeBegin()); 371 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 372 PetscCall(VecGetKokkosView(xx,&xv)); 373 PetscCall(VecGetKokkosView(yy,&yv)); 374 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 375 if (zz != yy) Kokkos::deep_copy(zv,yv); 376 if (A->form_explicit_transpose) { 377 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 378 mode = "N"; 379 } else { 380 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 381 csrmat = &aijkok->csrmat; 382 mode = "T"; 383 } 384 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 385 PetscCall(VecRestoreKokkosView(xx,&xv)); 386 PetscCall(VecRestoreKokkosView(yy,&yv)); 387 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 388 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(0); 391 } 392 393 /* z = A^H x + y */ 394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv,yv; 399 PetscScalarKokkosView zv; 400 KokkosCsrMatrix *csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx,&xv)); 406 PetscCall(VecGetKokkosView(yy,&yv)); 407 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 408 if (zz != yy) Kokkos::deep_copy(zv,yv); 409 if (A->form_explicit_transpose) { 410 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 411 mode = "N"; 412 } else { 413 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 414 csrmat = &aijkok->csrmat; 415 mode = "C"; 416 } 417 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 418 PetscCall(VecRestoreKokkosView(xx,&xv)); 419 PetscCall(VecRestoreKokkosView(yy,&yv)); 420 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 421 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 422 PetscCall(PetscLogGpuTimeEnd()); 423 PetscFunctionReturn(0); 424 } 425 426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 427 { 428 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 429 430 PetscFunctionBegin; 431 switch (op) { 432 case MAT_FORM_EXPLICIT_TRANSPOSE: 433 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 434 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 435 A->form_explicit_transpose = flg; 436 break; 437 default: 438 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 439 break; 440 } 441 PetscFunctionReturn(0); 442 } 443 444 /* Depending on reuse, either build a new mat, or use the existing mat */ 445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 446 { 447 Mat_SeqAIJ *aseq; 448 449 PetscFunctionBegin; 450 PetscCall(PetscKokkosInitializeCheck()); 451 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 452 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 453 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 454 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 455 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 456 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 457 PetscCall(PetscFree(A->defaultvectype)); 458 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 459 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 460 PetscCall(MatSetOps_SeqAIJKokkos(A)); 461 aseq = static_cast<Mat_SeqAIJ*>(A->data); 462 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 463 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 464 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 465 } 466 } 467 PetscFunctionReturn(0); 468 } 469 470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 471 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 472 */ 473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 474 { 475 Mat_SeqAIJ *bseq; 476 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 477 Mat mat; 478 479 PetscFunctionBegin; 480 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 481 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 482 mat = *B; 483 if (A->assembled) { 484 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 485 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 486 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 487 /* Now copy values to B if needed */ 488 if (dupOption == MAT_COPY_VALUES) { 489 if (akok->a_dual.need_sync_device()) { 490 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 491 bkok->a_dual.modify_host(); 492 } else { /* If device has the latest data, we only copy data on device */ 493 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 494 bkok->a_dual.modify_device(); 495 } 496 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 497 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 498 bkok->a_dual.modify_host(); 499 } 500 mat->spptr = bkok; 501 } 502 503 PetscCall(PetscFree(mat->defaultvectype)); 504 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 505 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 506 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 507 PetscFunctionReturn(0); 508 } 509 510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 511 { 512 Mat At; 513 KokkosCsrMatrix *internT; 514 Mat_SeqAIJKokkos *atkok,*bkok; 515 516 PetscFunctionBegin; 517 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 518 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 519 /* Deep copy internT, as we want to isolate the internal transpose */ 520 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 521 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 522 if (reuse == MAT_INITIAL_MATRIX) *B = At; 523 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 524 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 525 if ((*B)->assembled) { 526 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 527 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 528 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 529 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 530 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 531 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 532 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 533 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 534 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 535 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 536 } 537 PetscFunctionReturn(0); 538 } 539 540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 541 { 542 Mat_SeqAIJKokkos *aijkok; 543 544 PetscFunctionBegin; 545 if (A->factortype == MAT_FACTOR_NONE) { 546 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 547 delete aijkok; 548 } else { 549 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 550 } 551 A->spptr = NULL; 552 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 553 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 554 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 555 PetscCall(MatDestroy_SeqAIJ(A)); 556 PetscFunctionReturn(0); 557 } 558 559 /*MC 560 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 561 562 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 563 564 Options Database Keys: 565 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 566 567 Level: beginner 568 569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 570 M*/ 571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 572 { 573 PetscFunctionBegin; 574 PetscCall(PetscKokkosInitializeCheck()); 575 PetscCall(MatCreate_SeqAIJ(A)); 576 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 577 PetscFunctionReturn(0); 578 } 579 580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582 { 583 Mat_SeqAIJ *a,*b; 584 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 585 MatScalarKokkosView aa,ba,ca; 586 MatRowMapKokkosView ai,bi,ci; 587 MatColIdxKokkosView aj,bj,cj; 588 PetscInt m,n,nnz,aN; 589 590 PetscFunctionBegin; 591 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 592 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 593 PetscValidPointer(C,4); 594 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 595 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 596 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 597 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 598 599 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 600 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 601 a = static_cast<Mat_SeqAIJ*>(A->data); 602 b = static_cast<Mat_SeqAIJ*>(B->data); 603 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 604 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 605 aa = akok->a_dual.view_device(); 606 ai = akok->i_dual.view_device(); 607 ba = bkok->a_dual.view_device(); 608 bi = bkok->i_dual.view_device(); 609 m = A->rmap->n; /* M, N and nnz of C */ 610 n = A->cmap->n + B->cmap->n; 611 nnz = a->nz + b->nz; 612 aN = A->cmap->n; /* N of A */ 613 if (reuse == MAT_INITIAL_MATRIX) { 614 aj = akok->j_dual.view_device(); 615 bj = bkok->j_dual.view_device(); 616 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 617 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 618 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 619 ca = ca_dual.view_device(); 620 ci = ci_dual.view_device(); 621 cj = cj_dual.view_device(); 622 623 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 624 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 625 PetscInt i = t.league_rank(); /* row i */ 626 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 627 628 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 629 ci(i) = coffset; 630 if (i == m-1) ci(m) = ai(m) + bi(m); 631 }); 632 633 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634 if (k < alen) { 635 ca(coffset+k) = aa(ai(i)+k); 636 cj(coffset+k) = aj(ai(i)+k); 637 } else { 638 ca(coffset+k) = ba(bi(i)+k-alen); 639 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 640 } 641 }); 642 }); 643 ca_dual.modify_device(); 644 ci_dual.modify_device(); 645 cj_dual.modify_device(); 646 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 647 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 648 } else if (reuse == MAT_REUSE_MATRIX) { 649 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 650 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 651 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 652 ca = ckok->a_dual.view_device(); 653 ci = ckok->i_dual.view_device(); 654 655 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 656 PetscInt i = t.league_rank(); /* row i */ 657 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 658 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 659 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 660 else ca(ci(i)+k) = ba(bi(i)+k-alen); 661 }); 662 }); 663 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 664 } 665 PetscFunctionReturn(0); 666 } 667 668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 669 { 670 PetscFunctionBegin; 671 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 672 PetscFunctionReturn(0); 673 } 674 675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 676 { 677 Mat_Product *product = C->product; 678 Mat A,B; 679 bool transA,transB; /* use bool, since KK needs this type */ 680 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 681 Mat_SeqAIJ *c; 682 MatProductData_SeqAIJKokkos *pdata; 683 KokkosCsrMatrix *csrmatA,*csrmatB; 684 685 PetscFunctionBegin; 686 MatCheckProduct(C,1); 687 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 688 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 689 690 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 691 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 692 PetscFunctionReturn(0); 693 } 694 695 switch (product->type) { 696 case MATPRODUCT_AB: transA = false; transB = false; break; 697 case MATPRODUCT_AtB: transA = true; transB = false; break; 698 case MATPRODUCT_ABt: transA = false; transB = true; break; 699 default: 700 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 701 } 702 703 A = product->A; 704 B = product->B; 705 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 706 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 707 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 709 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 710 711 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 712 713 csrmatA = &akok->csrmat; 714 csrmatB = &bkok->csrmat; 715 716 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 717 if (transA) { 718 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 719 transA = false; 720 } 721 722 if (transB) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 724 transB = false; 725 } 726 PetscCall(PetscLogGpuTimeBegin()); 727 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 728 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 729 PetscCall(PetscLogGpuTimeEnd()); 730 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 731 /* shorter version of MatAssemblyEnd_SeqAIJ */ 732 c = (Mat_SeqAIJ*)C->data; 733 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 734 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 735 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 736 c->reallocs = 0; 737 C->info.mallocs = 0; 738 C->info.nz_unneeded = 0; 739 C->assembled = C->was_assembled = PETSC_TRUE; 740 C->num_ass++; 741 PetscFunctionReturn(0); 742 } 743 744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 745 { 746 Mat_Product *product = C->product; 747 MatProductType ptype; 748 Mat A,B; 749 bool transA,transB; 750 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 751 MatProductData_SeqAIJKokkos *pdata; 752 MPI_Comm comm; 753 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 754 755 PetscFunctionBegin; 756 MatCheckProduct(C,1); 757 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 758 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 759 A = product->A; 760 B = product->B; 761 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 762 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 763 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 764 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 765 csrmatA = &akok->csrmat; 766 csrmatB = &bkok->csrmat; 767 768 ptype = product->type; 769 switch (ptype) { 770 case MATPRODUCT_AB: transA = false; transB = false; break; 771 case MATPRODUCT_AtB: transA = true; transB = false; break; 772 case MATPRODUCT_ABt: transA = false; transB = true; break; 773 default: 774 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 775 } 776 777 product->data = pdata = new MatProductData_SeqAIJKokkos(); 778 pdata->kh.set_team_work_size(16); 779 pdata->kh.set_dynamic_scheduling(true); 780 pdata->reusesym = product->api_user; 781 782 /* TODO: add command line options to select spgemm algorithms */ 783 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 784 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 785 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 786 */ 787 pdata->kh.create_spgemm_handle(spgemm_alg); 788 789 PetscCall(PetscLogGpuTimeBegin()); 790 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 791 if (transA) { 792 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 793 transA = false; 794 } 795 796 if (transB) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 798 transB = false; 799 } 800 801 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 802 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 803 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 804 calling new Mat_SeqAIJKokkos(). 805 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 806 */ 807 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 808 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 809 PetscCall(PetscLogGpuTimeEnd()); 810 811 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 812 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 813 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 814 PetscFunctionReturn(0); 815 } 816 817 /* handles sparse matrix matrix ops */ 818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 819 { 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 826 if (product->type == MATPRODUCT_ABC) { 827 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 Mat_SeqAIJKokkos *aijkok; 853 854 PetscFunctionBegin; 855 PetscCall(PetscLogGpuTimeBegin()); 856 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 857 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 858 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 860 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 861 PetscCall(PetscLogGpuTimeEnd()); 862 PetscFunctionReturn(0); 863 } 864 865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 866 { 867 Mat_SeqAIJKokkos *aijkok; 868 869 PetscFunctionBegin; 870 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 871 if (aijkok) { /* Only zero the device if data is already there */ 872 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 873 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 874 } else { /* Might be preallocated but not assembled */ 875 PetscCall(MatZeroEntries_SeqAIJ(A)); 876 } 877 PetscFunctionReturn(0); 878 } 879 880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 882 { 883 Mat_SeqAIJKokkos *aijkok; 884 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 890 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891 *kv = aijkok->a_dual.view_device(); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 896 { 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 PetscFunctionReturn(0); 902 } 903 904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 905 { 906 Mat_SeqAIJKokkos *aijkok; 907 908 PetscFunctionBegin; 909 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 910 PetscValidPointer(kv,2); 911 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 912 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 913 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 914 *kv = aijkok->a_dual.view_device(); 915 PetscFunctionReturn(0); 916 } 917 918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 919 { 920 PetscFunctionBegin; 921 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 922 PetscValidPointer(kv,2); 923 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 924 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 925 PetscFunctionReturn(0); 926 } 927 928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 929 { 930 Mat_SeqAIJKokkos *aijkok; 931 932 PetscFunctionBegin; 933 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 934 PetscValidPointer(kv,2); 935 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 936 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 937 *kv = aijkok->a_dual.view_device(); 938 PetscFunctionReturn(0); 939 } 940 941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 942 { 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 948 PetscFunctionReturn(0); 949 } 950 951 /* Computes Y += alpha X */ 952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953 { 954 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 955 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 956 ConstMatScalarKokkosView Xa; 957 MatScalarKokkosView Ya; 958 959 PetscFunctionBegin; 960 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 961 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 962 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 963 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 964 PetscCall(PetscLogGpuTimeBegin()); 965 966 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 967 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 968 PetscBool e; 969 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 970 if (e) { 971 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 972 if (e) pattern = SAME_NONZERO_PATTERN; 973 } 974 } 975 976 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 977 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 978 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 979 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 980 */ 981 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 982 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 983 Xa = xkok->a_dual.view_device(); 984 Ya = ykok->a_dual.view_device(); 985 986 if (pattern == SAME_NONZERO_PATTERN) { 987 KokkosBlas::axpy(alpha,Xa,Ya); 988 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 989 } else if (pattern == SUBSET_NONZERO_PATTERN) { 990 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 991 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 992 993 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 994 PetscInt i = t.league_rank(); /* row i */ 995 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 996 PetscInt p,q = Yi(i); 997 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 998 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 999 if (Xj(p) == Yj(q)) { /* Found it */ 1000 Ya(q) += alpha * Xa(p); 1001 q++; 1002 } else { 1003 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1004 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1005 */ 1006 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1007 } 1008 } 1009 }); 1010 }); 1011 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1012 } else { /* different nonzero patterns */ 1013 Mat Z; 1014 KokkosCsrMatrix zcsr; 1015 KernelHandle kh; 1016 kh.create_spadd_handle(false); 1017 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1018 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1019 zkok = new Mat_SeqAIJKokkos(zcsr); 1020 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1021 PetscCall(MatHeaderReplace(Y,&Z)); 1022 kh.destroy_spadd_handle(); 1023 } 1024 PetscCall(PetscLogGpuTimeEnd()); 1025 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1026 PetscFunctionReturn(0); 1027 } 1028 1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1030 { 1031 Mat_SeqAIJKokkos *akok; 1032 Mat_SeqAIJ *aseq; 1033 1034 PetscFunctionBegin; 1035 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1036 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1037 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1038 delete akok; 1039 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1040 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1041 akok->SetUpCOO(aseq); 1042 PetscFunctionReturn(0); 1043 } 1044 1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1046 { 1047 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1048 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1049 PetscCount Annz = aseq->nz; 1050 const PetscCountKokkosView& jmap = akok->jmap_d; 1051 const PetscCountKokkosView& perm = akok->perm_d; 1052 MatScalarKokkosView Aa; 1053 ConstMatScalarKokkosView kv; 1054 PetscMemType memtype; 1055 1056 PetscFunctionBegin; 1057 PetscCall(PetscGetMemType(v,&memtype)); 1058 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1059 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1060 } else { 1061 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1062 } 1063 1064 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1065 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1066 1067 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1068 PetscScalar sum = 0.0; 1069 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1070 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1071 }); 1072 1073 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1074 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1075 PetscFunctionReturn(0); 1076 } 1077 1078 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1079 { 1080 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1081 MatScalarKokkosView Aa; 1082 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1083 PetscInt m = A->rmap->n; 1084 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1085 1086 PetscFunctionBegin; 1087 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1088 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1089 PetscScalar tmp; 1090 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1091 tmp = Aa(Ai(i)); 1092 Aa(Ai(i)) = Aa(Adiag(i)); 1093 Aa(Adiag(i)) = tmp; 1094 } 1095 }); 1096 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1097 PetscFunctionReturn(0); 1098 } 1099 1100 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1101 { 1102 PetscFunctionBegin; 1103 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1104 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1105 B->offloadmask = PETSC_OFFLOAD_CPU; 1106 PetscFunctionReturn(0); 1107 } 1108 1109 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1110 { 1111 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1112 1113 PetscFunctionBegin; 1114 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1115 A->boundtocpu = PETSC_FALSE; 1116 1117 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1118 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1119 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1120 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1121 A->ops->scale = MatScale_SeqAIJKokkos; 1122 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1123 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1124 A->ops->mult = MatMult_SeqAIJKokkos; 1125 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1126 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1127 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1128 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1129 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1130 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1131 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1132 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1133 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1134 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1135 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1136 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1137 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1138 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1139 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1140 1141 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1142 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1143 PetscFunctionReturn(0); 1144 } 1145 1146 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1147 { 1148 Mat_SeqAIJ *aseq; 1149 PetscInt i,m,n; 1150 1151 PetscFunctionBegin; 1152 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1153 1154 m = akok->nrows(); 1155 n = akok->ncols(); 1156 PetscCall(MatSetSizes(A,m,n,m,n)); 1157 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1158 1159 /* Set up data structures of A as a MATSEQAIJ */ 1160 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1161 aseq = (Mat_SeqAIJ*)(A)->data; 1162 1163 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1164 akok->j_dual.sync_host(); 1165 1166 aseq->i = akok->i_host_data(); 1167 aseq->j = akok->j_host_data(); 1168 aseq->a = akok->a_host_data(); 1169 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1170 aseq->singlemalloc = PETSC_FALSE; 1171 aseq->free_a = PETSC_FALSE; 1172 aseq->free_ij = PETSC_FALSE; 1173 aseq->nz = akok->nnz(); 1174 aseq->maxnz = aseq->nz; 1175 1176 PetscCall(PetscMalloc1(m,&aseq->imax)); 1177 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1178 for (i=0; i<m; i++) { 1179 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1180 } 1181 1182 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1183 akok->nonzerostate = A->nonzerostate; 1184 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1185 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1186 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1187 PetscFunctionReturn(0); 1188 } 1189 1190 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1191 1192 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1193 */ 1194 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1195 { 1196 PetscFunctionBegin; 1197 PetscCall(MatCreate(comm,A)); 1198 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 /* --------------------------------------------------------------------------------*/ 1203 /*@C 1204 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1205 (the default parallel PETSc format). This matrix will ultimately be handled by 1206 Kokkos for calculations. For good matrix 1207 assembly performance the user should preallocate the matrix storage by setting 1208 the parameter nz (or the array nnz). By setting these parameters accurately, 1209 performance during matrix assembly can be increased by more than a factor of 50. 1210 1211 Collective 1212 1213 Input Parameters: 1214 + comm - MPI communicator, set to PETSC_COMM_SELF 1215 . m - number of rows 1216 . n - number of columns 1217 . nz - number of nonzeros per row (same for all rows) 1218 - nnz - array containing the number of nonzeros in the various rows 1219 (possibly different for each row) or NULL 1220 1221 Output Parameter: 1222 . A - the matrix 1223 1224 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1225 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1226 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1227 1228 Notes: 1229 If nnz is given then nz is ignored 1230 1231 The AIJ format (also called the Yale sparse matrix format or 1232 compressed row storage), is fully compatible with standard Fortran 77 1233 storage. That is, the stored row and column indices can begin at 1234 either one (as in Fortran) or zero. See the users' manual for details. 1235 1236 Specify the preallocated storage with either nz or nnz (not both). 1237 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1238 allocation. For large problems you MUST preallocate memory or you 1239 will get TERRIBLE performance, see the users' manual chapter on matrices. 1240 1241 By default, this format uses inodes (identical nodes) when possible, to 1242 improve numerical efficiency of matrix-vector products and solves. We 1243 search for consecutive rows with the same nonzero structure, thereby 1244 reusing matrix information to achieve increased efficiency. 1245 1246 Level: intermediate 1247 1248 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1249 @*/ 1250 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1251 { 1252 PetscFunctionBegin; 1253 PetscCall(PetscKokkosInitializeCheck()); 1254 PetscCall(MatCreate(comm,A)); 1255 PetscCall(MatSetSizes(*A,m,n,m,n)); 1256 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1257 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1258 PetscFunctionReturn(0); 1259 } 1260 1261 typedef Kokkos::TeamPolicy<>::member_type team_member; 1262 // 1263 // This factorization exploits block diagonal matrices with "Nf" (not used). 1264 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1265 // 1266 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1267 { 1268 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1269 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1270 IS isrow = b->row,isicol = b->icol; 1271 const PetscInt *r_h,*ic_h; 1272 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1273 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1274 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1275 PetscBool row_identity,col_identity; 1276 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1277 1278 PetscFunctionBegin; 1279 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1280 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1281 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1282 PetscCall(ISGetIndices(isrow,&r_h)); 1283 PetscCall(ISGetIndices(isicol,&ic_h)); 1284 PetscCall(ISGetSize(isicol,&nc)); 1285 PetscCall(PetscLogGpuTimeBegin()); 1286 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1287 { 1288 #define KOKKOS_SHARED_LEVEL 1 1289 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1290 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1291 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1292 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1293 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1294 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1295 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1296 size_t flops_h = 0.0; 1297 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1298 Kokkos::View<size_t> d_flops_k ("flops"); 1299 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1300 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1301 Kokkos::deep_copy (d_flops_k, h_flops_k); 1302 Kokkos::deep_copy (d_r_k, h_r_k); 1303 Kokkos::deep_copy (d_ic_k, h_ic_k); 1304 // Fill A --> fact 1305 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1306 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1307 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1308 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1309 // zero rows of B 1310 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1311 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1312 PetscScalar *baL = ba_d + bi_d[rowb]; 1313 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1314 /* zero (unfactored row) */ 1315 for (int j=0;j<nzbL;j++) baL[j] = 0; 1316 for (int j=0;j<nzbU;j++) baU[j] = 0; 1317 }); 1318 // copy A into B 1319 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1320 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1321 const PetscScalar *av = aa_d + ai_d[rowa]; 1322 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1323 /* load in initial (unfactored row) */ 1324 for (int j=0;j<nza;j++) { 1325 PetscInt colb = ic[ajtmp[j]]; 1326 PetscScalar vala = av[j]; 1327 if (colb == rowb) { 1328 *(ba_d + bdiag_d[rowb]) = vala; 1329 } else { 1330 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1331 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1332 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1333 for (int j=0; j<nz ; j++) { 1334 if (pbj[j] == colb) { 1335 pba[j] = vala; 1336 set++; 1337 break; 1338 } 1339 } 1340 #if !defined(PETSC_HAVE_SYCL) 1341 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1342 #endif 1343 } 1344 } 1345 }); 1346 }); 1347 Kokkos::fence(); 1348 1349 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1350 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1351 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1352 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1353 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1354 const PetscInt start = field*nloc, end = start + nloc; 1355 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1356 // A22 panel update for each row A(1,:) and col A(:,1) 1357 for (int ii=start; ii<end-1; ii++) { 1358 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1359 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1360 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1361 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1362 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1363 PetscInt kIdx = j*Ni + field_block_idx; 1364 if (kIdx >= nzUi) /* void */ ; 1365 else { 1366 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1367 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1368 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1369 size_t st_idx; 1370 // find and do L(k,i) = A(:k,i) / A(i,i) 1371 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1372 // get column, there has got to be a better way 1373 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1374 if (pjL[j] == ii) { 1375 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1376 idx = j; // output 1377 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1378 } 1379 }, st_idx); 1380 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1381 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1382 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1383 #endif 1384 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1385 // U(i+1,:end) 1386 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1387 PetscScalar Uij = baUi[uiIdx]; 1388 PetscInt col = bjUi[uiIdx]; 1389 if (col==myk) { 1390 // A_kk = A_kk - L_ki * U_ij(k) 1391 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1392 *Akkv = *Akkv - L_ki() * Uij; // UiK 1393 } else { 1394 PetscScalar *start, *end, *pAkjv=NULL; 1395 PetscInt high, low; 1396 const PetscInt *startj; 1397 if (col<myk) { // L 1398 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1399 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1400 start = pLki+1; // start at pLki+1, A22(myk,1) 1401 startj= bj_d + bi_d[myk] + idx; 1402 end = ba_d + bi_d[myk+1]; 1403 } else { 1404 PetscInt idx = bdiag_d[myk+1]+1; 1405 start = ba_d + idx; 1406 startj= bj_d + idx; 1407 end = ba_d + bdiag_d[myk]; 1408 } 1409 // search for 'col', use bisection search - TODO 1410 low = 0; 1411 high = (PetscInt)(end-start); 1412 while (high-low > 5) { 1413 int t = (low+high)/2; 1414 if (startj[t] > col) high = t; 1415 else low = t; 1416 } 1417 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1418 if (startj[pAkjv-start] == col) break; 1419 } 1420 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1421 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1422 #endif 1423 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1424 } 1425 }); 1426 } 1427 }); 1428 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1429 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1430 } /* endof for (i=0; i<n; i++) { */ 1431 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1432 }); 1433 Kokkos::fence(); 1434 Kokkos::deep_copy (h_flops_k, d_flops_k); 1435 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1436 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1437 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1438 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1439 /* Invert diagonal for simpler triangular solves */ 1440 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1441 int i = start + outer_index*Ni + lg_rank%Ni; 1442 if (i < end) { 1443 PetscScalar *pv = ba_d + bdiag_d[i]; 1444 *pv = 1.0/(*pv); 1445 } 1446 }); 1447 }); 1448 } 1449 PetscCall(PetscLogGpuTimeEnd()); 1450 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1451 PetscCall(ISRestoreIndices(isrow,&r_h)); 1452 1453 PetscCall(ISIdentity(isrow,&row_identity)); 1454 PetscCall(ISIdentity(isicol,&col_identity)); 1455 if (b->inode.size) { 1456 B->ops->solve = MatSolve_SeqAIJ_Inode; 1457 } else if (row_identity && col_identity) { 1458 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1459 } else { 1460 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1461 } 1462 B->offloadmask = PETSC_OFFLOAD_GPU; 1463 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1464 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1465 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1466 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1467 B->ops->matsolve = MatMatSolve_SeqAIJ; 1468 B->assembled = PETSC_TRUE; 1469 B->preallocated = PETSC_TRUE; 1470 1471 PetscFunctionReturn(0); 1472 } 1473 1474 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1475 { 1476 PetscFunctionBegin; 1477 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1478 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1479 PetscFunctionReturn(0); 1480 } 1481 1482 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1483 { 1484 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1485 1486 PetscFunctionBegin; 1487 if (!factors->sptrsv_symbolic_completed) { 1488 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1489 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1490 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1491 } 1492 PetscFunctionReturn(0); 1493 } 1494 1495 /* Check if we need to update factors etc for transpose solve */ 1496 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1497 { 1498 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1499 MatColIdxType n = A->rmap->n; 1500 1501 PetscFunctionBegin; 1502 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1503 /* Update L^T and do sptrsv symbolic */ 1504 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1505 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1506 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1507 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1508 1509 KokkosKernels::Impl::transpose_matrix< 1510 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1511 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1512 MatRowMapKokkosView,DefaultExecutionSpace>( 1513 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1514 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1515 1516 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1517 We have to sort the indices, until KK provides finer control options. 1518 */ 1519 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1520 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1521 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1522 1523 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1524 1525 /* Update U^T and do sptrsv symbolic */ 1526 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1527 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1528 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1529 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1530 1531 KokkosKernels::Impl::transpose_matrix< 1532 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1533 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1534 MatRowMapKokkosView,DefaultExecutionSpace>( 1535 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1536 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1537 1538 /* Sort indices. See comments above */ 1539 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1540 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1541 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1542 1543 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1544 factors->transpose_updated = PETSC_TRUE; 1545 } 1546 PetscFunctionReturn(0); 1547 } 1548 1549 /* Solve Ax = b, with A = LU */ 1550 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1551 { 1552 ConstPetscScalarKokkosView bv; 1553 PetscScalarKokkosView xv; 1554 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1555 1556 PetscFunctionBegin; 1557 PetscCall(PetscLogGpuTimeBegin()); 1558 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1559 PetscCall(VecGetKokkosView(b,&bv)); 1560 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1561 /* Solve L tmpv = b */ 1562 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1563 /* Solve Ux = tmpv */ 1564 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1565 PetscCall(VecRestoreKokkosView(b,&bv)); 1566 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1567 PetscCall(PetscLogGpuTimeEnd()); 1568 PetscFunctionReturn(0); 1569 } 1570 1571 /* Solve A^T x = b, where A^T = U^T L^T */ 1572 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1573 { 1574 ConstPetscScalarKokkosView bv; 1575 PetscScalarKokkosView xv; 1576 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1577 1578 PetscFunctionBegin; 1579 PetscCall(PetscLogGpuTimeBegin()); 1580 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1581 PetscCall(VecGetKokkosView(b,&bv)); 1582 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1583 /* Solve U^T tmpv = b */ 1584 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1585 1586 /* Solve L^T x = tmpv */ 1587 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1588 PetscCall(VecRestoreKokkosView(b,&bv)); 1589 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1590 PetscCall(PetscLogGpuTimeEnd()); 1591 PetscFunctionReturn(0); 1592 } 1593 1594 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1595 { 1596 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1597 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1598 PetscInt fill_lev = info->levels; 1599 1600 PetscFunctionBegin; 1601 PetscCall(PetscLogGpuTimeBegin()); 1602 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1603 1604 auto a_d = aijkok->a_dual.view_device(); 1605 auto i_d = aijkok->i_dual.view_device(); 1606 auto j_d = aijkok->j_dual.view_device(); 1607 1608 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1609 1610 B->assembled = PETSC_TRUE; 1611 B->preallocated = PETSC_TRUE; 1612 B->ops->solve = MatSolve_SeqAIJKokkos; 1613 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1614 B->ops->matsolve = NULL; 1615 B->ops->matsolvetranspose = NULL; 1616 B->offloadmask = PETSC_OFFLOAD_GPU; 1617 1618 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1619 factors->transpose_updated = PETSC_FALSE; 1620 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1621 /* TODO: log flops, but how to know that? */ 1622 PetscCall(PetscLogGpuTimeEnd()); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1627 { 1628 Mat_SeqAIJKokkos *aijkok; 1629 Mat_SeqAIJ *b; 1630 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1631 PetscInt fill_lev = info->levels; 1632 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1633 PetscInt n = A->rmap->n; 1634 1635 PetscFunctionBegin; 1636 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1637 /* Rebuild factors */ 1638 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1639 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1640 1641 /* Create a spiluk handle and then do symbolic factorization */ 1642 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1643 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1644 1645 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1646 1647 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1648 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1649 Kokkos::realloc(factors->iU_d,n+1); 1650 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1651 1652 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1653 auto i_d = aijkok->i_dual.view_device(); 1654 auto j_d = aijkok->j_dual.view_device(); 1655 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1656 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1657 1658 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1659 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1660 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1661 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1662 1663 /* TODO: add options to select sptrsv algorithms */ 1664 /* Create sptrsv handles for L, U and their transpose */ 1665 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1666 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1667 #else 1668 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1669 #endif 1670 1671 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1672 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1673 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1674 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1675 1676 /* Fill fields of the factor matrix B */ 1677 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1678 b = (Mat_SeqAIJ*)B->data; 1679 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1680 B->info.fill_ratio_given = info->fill; 1681 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1682 1683 B->offloadmask = PETSC_OFFLOAD_GPU; 1684 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1685 PetscFunctionReturn(0); 1686 } 1687 1688 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1689 { 1690 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1691 const PetscInt nrows = A->rmap->n; 1692 1693 PetscFunctionBegin; 1694 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1695 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1696 // move B data into Kokkos 1697 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1698 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1699 { 1700 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1701 if (!baijkok->diag_d.extent(0)) { 1702 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1703 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1704 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1705 } 1706 } 1707 PetscFunctionReturn(0); 1708 } 1709 1710 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1711 { 1712 PetscFunctionBegin; 1713 *type = MATSOLVERKOKKOS; 1714 PetscFunctionReturn(0); 1715 } 1716 1717 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1718 { 1719 PetscFunctionBegin; 1720 *type = MATSOLVERKOKKOSDEVICE; 1721 PetscFunctionReturn(0); 1722 } 1723 1724 /*MC 1725 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1726 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1727 1728 Level: beginner 1729 1730 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1731 M*/ 1732 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1733 { 1734 PetscInt n = A->rmap->n; 1735 1736 PetscFunctionBegin; 1737 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1738 PetscCall(MatSetSizes(*B,n,n,n,n)); 1739 (*B)->factortype = ftype; 1740 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1741 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1742 1743 if (ftype == MAT_FACTOR_LU) { 1744 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1745 (*B)->canuseordering = PETSC_TRUE; 1746 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1747 } else if (ftype == MAT_FACTOR_ILU) { 1748 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1749 (*B)->canuseordering = PETSC_FALSE; 1750 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1751 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1752 1753 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1754 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1759 { 1760 PetscInt n = A->rmap->n; 1761 1762 PetscFunctionBegin; 1763 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1764 PetscCall(MatSetSizes(*B,n,n,n,n)); 1765 (*B)->factortype = ftype; 1766 (*B)->canuseordering = PETSC_TRUE; 1767 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1768 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1769 1770 if (ftype == MAT_FACTOR_LU) { 1771 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1772 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1773 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1774 1775 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1776 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1777 PetscFunctionReturn(0); 1778 } 1779 1780 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1781 { 1782 PetscFunctionBegin; 1783 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1784 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1785 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1786 PetscFunctionReturn(0); 1787 } 1788 1789 /* Utility to print out a KokkosCsrMatrix for debugging */ 1790 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1791 { 1792 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1793 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1794 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1795 const PetscInt *i = iv.data(); 1796 const PetscInt *j = jv.data(); 1797 const PetscScalar *a = av.data(); 1798 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1799 1800 PetscFunctionBegin; 1801 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1802 for (PetscInt k=0; k<m; k++) { 1803 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1804 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1805 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1806 } 1807 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1808 } 1809 PetscFunctionReturn(0); 1810 } 1811