1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok && aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 if (aijkok) { 105 aijkok->a_dual.sync_host(); 106 *array = aijkok->a_dual.view_host().data(); 107 } else { /* Happens when calling MatSetValues on a newly created matrix */ 108 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 109 } 110 PetscFunctionReturn(0); 111 } 112 113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 114 { 115 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 116 117 PetscFunctionBegin; 118 if (aijkok) aijkok->a_dual.modify_host(); 119 PetscFunctionReturn(0); 120 } 121 122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 123 { 124 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 125 126 PetscFunctionBegin; 127 if (aijkok) { 128 aijkok->a_dual.sync_host(); 129 *array = aijkok->a_dual.view_host().data(); 130 } else { 131 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 132 } 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137 { 138 PetscFunctionBegin; 139 *array = NULL; 140 PetscFunctionReturn(0); 141 } 142 143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 144 { 145 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 146 147 PetscFunctionBegin; 148 if (aijkok) { 149 *array = aijkok->a_dual.view_host().data(); 150 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 151 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 152 } 153 PetscFunctionReturn(0); 154 } 155 156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 157 { 158 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 159 160 PetscFunctionBegin; 161 if (aijkok) { 162 aijkok->a_dual.clear_sync_state(); 163 aijkok->a_dual.modify_host(); 164 } 165 PetscFunctionReturn(0); 166 } 167 168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 169 { 170 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 171 172 PetscFunctionBegin; 173 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 174 175 if (i) *i = aijkok->i_device_data(); 176 if (j) *j = aijkok->j_device_data(); 177 if (a) { 178 aijkok->a_dual.sync_device(); 179 *a = aijkok->a_device_data(); 180 } 181 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 182 PetscFunctionReturn(0); 183 } 184 185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 187 { 188 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 190 191 PetscFunctionBegin; 192 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 193 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 194 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 195 PetscFunctionReturn(0); 196 } 197 198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 200 { 201 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 202 203 PetscFunctionBegin; 204 if (aijkok && aijkok->device_mat_d.data()) { 205 *d_mat = aijkok->device_mat_d.data(); 206 } else { 207 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 208 *d_mat = NULL; 209 } 210 PetscFunctionReturn(0); 211 } 212 213 /* Generate the transpose on device and cache it internally */ 214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 215 { 216 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 217 218 PetscFunctionBegin; 219 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 222 PetscCallCXX(aijkok->a_dual.sync_device()); 223 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 224 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 225 aijkok->transpose_updated = PETSC_TRUE; 226 } 227 *csrmatT = &aijkok->csrmatT; 228 PetscFunctionReturn(0); 229 } 230 231 /* Generate the Hermitian on device and cache it internally */ 232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 233 { 234 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 235 236 PetscFunctionBegin; 237 PetscCall(PetscLogGpuTimeBegin()); 238 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 239 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 240 PetscCallCXX(aijkok->a_dual.sync_device()); 241 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 242 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 243 #if defined(PETSC_USE_COMPLEX) 244 const auto& a = aijkok->csrmatH.values; 245 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 246 #endif 247 aijkok->hermitian_updated = PETSC_TRUE; 248 } 249 *csrmatH = &aijkok->csrmatH; 250 PetscCall(PetscLogGpuTimeEnd()); 251 PetscFunctionReturn(0); 252 } 253 254 /* y = A x */ 255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 256 { 257 Mat_SeqAIJKokkos *aijkok; 258 ConstPetscScalarKokkosView xv; 259 PetscScalarKokkosView yv; 260 261 PetscFunctionBegin; 262 PetscCall(PetscLogGpuTimeBegin()); 263 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 264 PetscCall(VecGetKokkosView(xx,&xv)); 265 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 266 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 267 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 268 PetscCall(VecRestoreKokkosView(xx,&xv)); 269 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 270 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 271 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 272 PetscCall(PetscLogGpuTimeEnd()); 273 PetscFunctionReturn(0); 274 } 275 276 /* y = A^T x */ 277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 278 { 279 Mat_SeqAIJKokkos *aijkok; 280 const char *mode; 281 ConstPetscScalarKokkosView xv; 282 PetscScalarKokkosView yv; 283 KokkosCsrMatrix *csrmat; 284 285 PetscFunctionBegin; 286 PetscCall(PetscLogGpuTimeBegin()); 287 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 288 PetscCall(VecGetKokkosView(xx,&xv)); 289 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 290 if (A->form_explicit_transpose) { 291 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 292 mode = "N"; 293 } else { 294 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 295 csrmat = &aijkok->csrmat; 296 mode = "T"; 297 } 298 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 299 PetscCall(VecRestoreKokkosView(xx,&xv)); 300 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 301 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 302 PetscCall(PetscLogGpuTimeEnd()); 303 PetscFunctionReturn(0); 304 } 305 306 /* y = A^H x */ 307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 308 { 309 Mat_SeqAIJKokkos *aijkok; 310 const char *mode; 311 ConstPetscScalarKokkosView xv; 312 PetscScalarKokkosView yv; 313 KokkosCsrMatrix *csrmat; 314 315 PetscFunctionBegin; 316 PetscCall(PetscLogGpuTimeBegin()); 317 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 318 PetscCall(VecGetKokkosView(xx,&xv)); 319 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 320 if (A->form_explicit_transpose) { 321 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 322 mode = "N"; 323 } else { 324 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325 csrmat = &aijkok->csrmat; 326 mode = "C"; 327 } 328 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 329 PetscCall(VecRestoreKokkosView(xx,&xv)); 330 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 331 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 332 PetscCall(PetscLogGpuTimeEnd()); 333 PetscFunctionReturn(0); 334 } 335 336 /* z = A x + y */ 337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 338 { 339 Mat_SeqAIJKokkos *aijkok; 340 ConstPetscScalarKokkosView xv,yv; 341 PetscScalarKokkosView zv; 342 343 PetscFunctionBegin; 344 PetscCall(PetscLogGpuTimeBegin()); 345 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 346 PetscCall(VecGetKokkosView(xx,&xv)); 347 PetscCall(VecGetKokkosView(yy,&yv)); 348 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 349 if (zz != yy) Kokkos::deep_copy(zv,yv); 350 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 351 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 352 PetscCall(VecRestoreKokkosView(xx,&xv)); 353 PetscCall(VecRestoreKokkosView(yy,&yv)); 354 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 355 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 356 PetscCall(PetscLogGpuTimeEnd()); 357 PetscFunctionReturn(0); 358 } 359 360 /* z = A^T x + y */ 361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 362 { 363 Mat_SeqAIJKokkos *aijkok; 364 const char *mode; 365 ConstPetscScalarKokkosView xv,yv; 366 PetscScalarKokkosView zv; 367 KokkosCsrMatrix *csrmat; 368 369 PetscFunctionBegin; 370 PetscCall(PetscLogGpuTimeBegin()); 371 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 372 PetscCall(VecGetKokkosView(xx,&xv)); 373 PetscCall(VecGetKokkosView(yy,&yv)); 374 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 375 if (zz != yy) Kokkos::deep_copy(zv,yv); 376 if (A->form_explicit_transpose) { 377 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 378 mode = "N"; 379 } else { 380 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 381 csrmat = &aijkok->csrmat; 382 mode = "T"; 383 } 384 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 385 PetscCall(VecRestoreKokkosView(xx,&xv)); 386 PetscCall(VecRestoreKokkosView(yy,&yv)); 387 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 388 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 389 PetscCall(PetscLogGpuTimeEnd()); 390 PetscFunctionReturn(0); 391 } 392 393 /* z = A^H x + y */ 394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 395 { 396 Mat_SeqAIJKokkos *aijkok; 397 const char *mode; 398 ConstPetscScalarKokkosView xv,yv; 399 PetscScalarKokkosView zv; 400 KokkosCsrMatrix *csrmat; 401 402 PetscFunctionBegin; 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 405 PetscCall(VecGetKokkosView(xx,&xv)); 406 PetscCall(VecGetKokkosView(yy,&yv)); 407 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 408 if (zz != yy) Kokkos::deep_copy(zv,yv); 409 if (A->form_explicit_transpose) { 410 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 411 mode = "N"; 412 } else { 413 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 414 csrmat = &aijkok->csrmat; 415 mode = "C"; 416 } 417 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 418 PetscCall(VecRestoreKokkosView(xx,&xv)); 419 PetscCall(VecRestoreKokkosView(yy,&yv)); 420 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 421 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 422 PetscCall(PetscLogGpuTimeEnd()); 423 PetscFunctionReturn(0); 424 } 425 426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 427 { 428 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 429 430 PetscFunctionBegin; 431 switch (op) { 432 case MAT_FORM_EXPLICIT_TRANSPOSE: 433 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 434 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 435 A->form_explicit_transpose = flg; 436 break; 437 default: 438 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 439 break; 440 } 441 PetscFunctionReturn(0); 442 } 443 444 /* Depending on reuse, either build a new mat, or use the existing mat */ 445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 446 { 447 Mat_SeqAIJ *aseq; 448 449 PetscFunctionBegin; 450 PetscCall(PetscKokkosInitializeCheck()); 451 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 452 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 453 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 454 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 455 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 456 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 457 PetscCall(PetscFree(A->defaultvectype)); 458 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 459 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 460 PetscCall(MatSetOps_SeqAIJKokkos(A)); 461 aseq = static_cast<Mat_SeqAIJ*>(A->data); 462 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 463 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 464 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 465 } 466 } 467 PetscFunctionReturn(0); 468 } 469 470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 471 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 472 */ 473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 474 { 475 Mat_SeqAIJ *bseq; 476 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 477 Mat mat; 478 479 PetscFunctionBegin; 480 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 481 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 482 mat = *B; 483 if (A->assembled) { 484 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 485 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 486 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 487 /* Now copy values to B if needed */ 488 if (dupOption == MAT_COPY_VALUES) { 489 if (akok->a_dual.need_sync_device()) { 490 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 491 bkok->a_dual.modify_host(); 492 } else { /* If device has the latest data, we only copy data on device */ 493 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 494 bkok->a_dual.modify_device(); 495 } 496 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 497 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 498 bkok->a_dual.modify_host(); 499 } 500 mat->spptr = bkok; 501 } 502 503 PetscCall(PetscFree(mat->defaultvectype)); 504 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 505 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 506 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 507 PetscFunctionReturn(0); 508 } 509 510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 511 { 512 Mat At; 513 KokkosCsrMatrix *internT; 514 Mat_SeqAIJKokkos *atkok,*bkok; 515 516 PetscFunctionBegin; 517 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 518 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 519 /* Deep copy internT, as we want to isolate the internal transpose */ 520 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 521 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 522 if (reuse == MAT_INITIAL_MATRIX) *B = At; 523 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 524 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 525 if ((*B)->assembled) { 526 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 527 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 528 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 529 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 530 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 531 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 532 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 533 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 534 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 535 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 536 } 537 PetscFunctionReturn(0); 538 } 539 540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 541 { 542 Mat_SeqAIJKokkos *aijkok; 543 544 PetscFunctionBegin; 545 if (A->factortype == MAT_FACTOR_NONE) { 546 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 547 delete aijkok; 548 } else { 549 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 550 } 551 A->spptr = NULL; 552 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 553 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 554 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 555 PetscCall(MatDestroy_SeqAIJ(A)); 556 PetscFunctionReturn(0); 557 } 558 559 /*MC 560 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 561 562 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 563 564 Options Database Keys: 565 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 566 567 Level: beginner 568 569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS 570 M*/ 571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 572 { 573 PetscFunctionBegin; 574 PetscCall(PetscKokkosInitializeCheck()); 575 PetscCall(MatCreate_SeqAIJ(A)); 576 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 577 PetscFunctionReturn(0); 578 } 579 580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 582 { 583 Mat_SeqAIJ *a,*b; 584 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 585 MatScalarKokkosView aa,ba,ca; 586 MatRowMapKokkosView ai,bi,ci; 587 MatColIdxKokkosView aj,bj,cj; 588 PetscInt m,n,nnz,aN; 589 590 PetscFunctionBegin; 591 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 592 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 593 PetscValidPointer(C,4); 594 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 595 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 596 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 597 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 598 599 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 600 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 601 a = static_cast<Mat_SeqAIJ*>(A->data); 602 b = static_cast<Mat_SeqAIJ*>(B->data); 603 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 604 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 605 aa = akok->a_dual.view_device(); 606 ai = akok->i_dual.view_device(); 607 ba = bkok->a_dual.view_device(); 608 bi = bkok->i_dual.view_device(); 609 m = A->rmap->n; /* M, N and nnz of C */ 610 n = A->cmap->n + B->cmap->n; 611 nnz = a->nz + b->nz; 612 aN = A->cmap->n; /* N of A */ 613 if (reuse == MAT_INITIAL_MATRIX) { 614 aj = akok->j_dual.view_device(); 615 bj = bkok->j_dual.view_device(); 616 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 617 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 618 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 619 ca = ca_dual.view_device(); 620 ci = ci_dual.view_device(); 621 cj = cj_dual.view_device(); 622 623 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 624 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 625 PetscInt i = t.league_rank(); /* row i */ 626 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 627 628 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 629 ci(i) = coffset; 630 if (i == m-1) ci(m) = ai(m) + bi(m); 631 }); 632 633 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634 if (k < alen) { 635 ca(coffset+k) = aa(ai(i)+k); 636 cj(coffset+k) = aj(ai(i)+k); 637 } else { 638 ca(coffset+k) = ba(bi(i)+k-alen); 639 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 640 } 641 }); 642 }); 643 ca_dual.modify_device(); 644 ci_dual.modify_device(); 645 cj_dual.modify_device(); 646 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 647 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 648 } else if (reuse == MAT_REUSE_MATRIX) { 649 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 650 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 651 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 652 ca = ckok->a_dual.view_device(); 653 ci = ckok->i_dual.view_device(); 654 655 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 656 PetscInt i = t.league_rank(); /* row i */ 657 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 658 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 659 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 660 else ca(ci(i)+k) = ba(bi(i)+k-alen); 661 }); 662 }); 663 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 664 } 665 PetscFunctionReturn(0); 666 } 667 668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 669 { 670 PetscFunctionBegin; 671 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 672 PetscFunctionReturn(0); 673 } 674 675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 676 { 677 Mat_Product *product = C->product; 678 Mat A,B; 679 bool transA,transB; /* use bool, since KK needs this type */ 680 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 681 Mat_SeqAIJ *c; 682 MatProductData_SeqAIJKokkos *pdata; 683 KokkosCsrMatrix *csrmatA,*csrmatB; 684 685 PetscFunctionBegin; 686 MatCheckProduct(C,1); 687 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 688 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 689 690 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 691 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 692 PetscFunctionReturn(0); 693 } 694 695 switch (product->type) { 696 case MATPRODUCT_AB: transA = false; transB = false; break; 697 case MATPRODUCT_AtB: transA = true; transB = false; break; 698 case MATPRODUCT_ABt: transA = false; transB = true; break; 699 default: 700 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 701 } 702 703 A = product->A; 704 B = product->B; 705 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 706 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 707 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 708 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 709 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 710 711 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 712 713 csrmatA = &akok->csrmat; 714 csrmatB = &bkok->csrmat; 715 716 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 717 if (transA) { 718 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 719 transA = false; 720 } 721 722 if (transB) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 724 transB = false; 725 } 726 PetscCall(PetscLogGpuTimeBegin()); 727 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 728 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 729 PetscCall(PetscLogGpuTimeEnd()); 730 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 731 /* shorter version of MatAssemblyEnd_SeqAIJ */ 732 c = (Mat_SeqAIJ*)C->data; 733 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 734 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 735 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 736 c->reallocs = 0; 737 C->info.mallocs = 0; 738 C->info.nz_unneeded = 0; 739 C->assembled = C->was_assembled = PETSC_TRUE; 740 C->num_ass++; 741 PetscFunctionReturn(0); 742 } 743 744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 745 { 746 Mat_Product *product = C->product; 747 MatProductType ptype; 748 Mat A,B; 749 bool transA,transB; 750 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 751 MatProductData_SeqAIJKokkos *pdata; 752 MPI_Comm comm; 753 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 754 755 PetscFunctionBegin; 756 MatCheckProduct(C,1); 757 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 758 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 759 A = product->A; 760 B = product->B; 761 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 762 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 763 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 764 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 765 csrmatA = &akok->csrmat; 766 csrmatB = &bkok->csrmat; 767 768 ptype = product->type; 769 switch (ptype) { 770 case MATPRODUCT_AB: transA = false; transB = false; break; 771 case MATPRODUCT_AtB: transA = true; transB = false; break; 772 case MATPRODUCT_ABt: transA = false; transB = true; break; 773 default: 774 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 775 } 776 777 product->data = pdata = new MatProductData_SeqAIJKokkos(); 778 pdata->kh.set_team_work_size(16); 779 pdata->kh.set_dynamic_scheduling(true); 780 pdata->reusesym = product->api_user; 781 782 /* TODO: add command line options to select spgemm algorithms */ 783 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 784 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 785 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 786 */ 787 pdata->kh.create_spgemm_handle(spgemm_alg); 788 789 PetscCall(PetscLogGpuTimeBegin()); 790 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 791 if (transA) { 792 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 793 transA = false; 794 } 795 796 if (transB) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 798 transB = false; 799 } 800 801 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 802 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 803 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 804 calling new Mat_SeqAIJKokkos(). 805 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 806 */ 807 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 808 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 809 PetscCall(PetscLogGpuTimeEnd()); 810 811 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 812 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 813 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 814 PetscFunctionReturn(0); 815 } 816 817 /* handles sparse matrix matrix ops */ 818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 819 { 820 Mat_Product *product = mat->product; 821 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822 823 PetscFunctionBegin; 824 MatCheckProduct(mat,1); 825 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 826 if (product->type == MATPRODUCT_ABC) { 827 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 828 } 829 if (Biskok && Ciskok) { 830 switch (product->type) { 831 case MATPRODUCT_AB: 832 case MATPRODUCT_AtB: 833 case MATPRODUCT_ABt: 834 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835 break; 836 case MATPRODUCT_PtAP: 837 case MATPRODUCT_RARt: 838 case MATPRODUCT_ABC: 839 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840 break; 841 default: 842 break; 843 } 844 } else { /* fallback for AIJ */ 845 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 846 } 847 PetscFunctionReturn(0); 848 } 849 850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851 { 852 Mat_SeqAIJKokkos *aijkok; 853 854 PetscFunctionBegin; 855 PetscCall(PetscLogGpuTimeBegin()); 856 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 857 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 858 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 859 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 860 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 861 PetscCall(PetscLogGpuTimeEnd()); 862 PetscFunctionReturn(0); 863 } 864 865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 866 { 867 Mat_SeqAIJKokkos *aijkok; 868 869 PetscFunctionBegin; 870 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 871 if (aijkok) { /* Only zero the device if data is already there */ 872 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 873 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 874 } else { /* Might be preallocated but not assembled */ 875 PetscCall(MatZeroEntries_SeqAIJ(A)); 876 } 877 PetscFunctionReturn(0); 878 } 879 880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 882 { 883 Mat_SeqAIJKokkos *aijkok; 884 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 890 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 891 *kv = aijkok->a_dual.view_device(); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 896 { 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 PetscFunctionReturn(0); 902 } 903 904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 905 { 906 Mat_SeqAIJKokkos *aijkok; 907 908 PetscFunctionBegin; 909 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 910 PetscValidPointer(kv,2); 911 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 912 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 913 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 914 *kv = aijkok->a_dual.view_device(); 915 PetscFunctionReturn(0); 916 } 917 918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 919 { 920 PetscFunctionBegin; 921 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 922 PetscValidPointer(kv,2); 923 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 924 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 925 PetscFunctionReturn(0); 926 } 927 928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 929 { 930 Mat_SeqAIJKokkos *aijkok; 931 932 PetscFunctionBegin; 933 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 934 PetscValidPointer(kv,2); 935 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 936 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 937 *kv = aijkok->a_dual.view_device(); 938 PetscFunctionReturn(0); 939 } 940 941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 942 { 943 PetscFunctionBegin; 944 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 945 PetscValidPointer(kv,2); 946 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 947 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 948 PetscFunctionReturn(0); 949 } 950 951 /* Computes Y += alpha X */ 952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 953 { 954 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 955 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 956 ConstMatScalarKokkosView Xa; 957 MatScalarKokkosView Ya; 958 959 PetscFunctionBegin; 960 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 961 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 962 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 963 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 964 PetscCall(PetscLogGpuTimeBegin()); 965 966 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 967 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 968 PetscBool e; 969 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 970 if (e) { 971 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 972 if (e) pattern = SAME_NONZERO_PATTERN; 973 } 974 } 975 976 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 977 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 978 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 979 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 980 */ 981 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 982 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 983 Xa = xkok->a_dual.view_device(); 984 Ya = ykok->a_dual.view_device(); 985 986 if (pattern == SAME_NONZERO_PATTERN) { 987 KokkosBlas::axpy(alpha,Xa,Ya); 988 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 989 } else if (pattern == SUBSET_NONZERO_PATTERN) { 990 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 991 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 992 993 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 994 PetscInt i = t.league_rank(); /* row i */ 995 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 996 PetscInt p,q = Yi(i); 997 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 998 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 999 if (Xj(p) == Yj(q)) { /* Found it */ 1000 Ya(q) += alpha * Xa(p); 1001 q++; 1002 } else { 1003 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1004 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1005 */ 1006 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1007 } 1008 } 1009 }); 1010 }); 1011 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1012 } else { /* different nonzero patterns */ 1013 Mat Z; 1014 KokkosCsrMatrix zcsr; 1015 KernelHandle kh; 1016 kh.create_spadd_handle(false); 1017 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1018 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1019 zkok = new Mat_SeqAIJKokkos(zcsr); 1020 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1021 PetscCall(MatHeaderReplace(Y,&Z)); 1022 kh.destroy_spadd_handle(); 1023 } 1024 PetscCall(PetscLogGpuTimeEnd()); 1025 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1026 PetscFunctionReturn(0); 1027 } 1028 1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1030 { 1031 Mat_SeqAIJKokkos *akok; 1032 Mat_SeqAIJ *aseq; 1033 1034 PetscFunctionBegin; 1035 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1036 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1037 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1038 delete akok; 1039 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1040 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1041 akok->SetUpCOO(aseq); 1042 PetscFunctionReturn(0); 1043 } 1044 1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1046 { 1047 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1048 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1049 PetscCount Annz = aseq->nz; 1050 const PetscCountKokkosView& jmap = akok->jmap_d; 1051 const PetscCountKokkosView& perm = akok->perm_d; 1052 MatScalarKokkosView Aa; 1053 ConstMatScalarKokkosView kv; 1054 PetscMemType memtype; 1055 1056 PetscFunctionBegin; 1057 PetscCall(PetscGetMemType(v,&memtype)); 1058 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1059 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1060 } else { 1061 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1062 } 1063 1064 if (imode == INSERT_VALUES) { 1065 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1066 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1067 } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1068 1069 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1070 1071 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1072 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1073 PetscFunctionReturn(0); 1074 } 1075 1076 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1077 { 1078 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1079 MatScalarKokkosView Aa; 1080 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1081 PetscInt m = A->rmap->n; 1082 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1083 1084 PetscFunctionBegin; 1085 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1086 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1087 PetscScalar tmp; 1088 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1089 tmp = Aa(Ai(i)); 1090 Aa(Ai(i)) = Aa(Adiag(i)); 1091 Aa(Adiag(i)) = tmp; 1092 } 1093 }); 1094 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1095 PetscFunctionReturn(0); 1096 } 1097 1098 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1099 { 1100 PetscFunctionBegin; 1101 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1102 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1103 B->offloadmask = PETSC_OFFLOAD_CPU; 1104 PetscFunctionReturn(0); 1105 } 1106 1107 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1108 { 1109 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1110 1111 PetscFunctionBegin; 1112 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1113 A->boundtocpu = PETSC_FALSE; 1114 1115 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1116 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1117 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1118 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1119 A->ops->scale = MatScale_SeqAIJKokkos; 1120 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1121 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1122 A->ops->mult = MatMult_SeqAIJKokkos; 1123 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1124 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1125 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1126 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1127 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1128 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1129 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1130 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1131 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1132 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1133 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1134 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1135 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1136 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1137 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1138 1139 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1140 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1141 PetscFunctionReturn(0); 1142 } 1143 1144 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1145 { 1146 Mat_SeqAIJ *aseq; 1147 PetscInt i,m,n; 1148 1149 PetscFunctionBegin; 1150 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1151 1152 m = akok->nrows(); 1153 n = akok->ncols(); 1154 PetscCall(MatSetSizes(A,m,n,m,n)); 1155 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1156 1157 /* Set up data structures of A as a MATSEQAIJ */ 1158 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1159 aseq = (Mat_SeqAIJ*)(A)->data; 1160 1161 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1162 akok->j_dual.sync_host(); 1163 1164 aseq->i = akok->i_host_data(); 1165 aseq->j = akok->j_host_data(); 1166 aseq->a = akok->a_host_data(); 1167 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1168 aseq->singlemalloc = PETSC_FALSE; 1169 aseq->free_a = PETSC_FALSE; 1170 aseq->free_ij = PETSC_FALSE; 1171 aseq->nz = akok->nnz(); 1172 aseq->maxnz = aseq->nz; 1173 1174 PetscCall(PetscMalloc1(m,&aseq->imax)); 1175 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1176 for (i=0; i<m; i++) { 1177 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1178 } 1179 1180 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1181 akok->nonzerostate = A->nonzerostate; 1182 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1183 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1184 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1189 1190 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1191 */ 1192 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1193 { 1194 PetscFunctionBegin; 1195 PetscCall(MatCreate(comm,A)); 1196 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1197 PetscFunctionReturn(0); 1198 } 1199 1200 /* --------------------------------------------------------------------------------*/ 1201 /*@C 1202 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1203 (the default parallel PETSc format). This matrix will ultimately be handled by 1204 Kokkos for calculations. For good matrix 1205 assembly performance the user should preallocate the matrix storage by setting 1206 the parameter nz (or the array nnz). By setting these parameters accurately, 1207 performance during matrix assembly can be increased by more than a factor of 50. 1208 1209 Collective 1210 1211 Input Parameters: 1212 + comm - MPI communicator, set to PETSC_COMM_SELF 1213 . m - number of rows 1214 . n - number of columns 1215 . nz - number of nonzeros per row (same for all rows) 1216 - nnz - array containing the number of nonzeros in the various rows 1217 (possibly different for each row) or NULL 1218 1219 Output Parameter: 1220 . A - the matrix 1221 1222 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1223 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1224 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1225 1226 Notes: 1227 If nnz is given then nz is ignored 1228 1229 The AIJ format (also called the Yale sparse matrix format or 1230 compressed row storage), is fully compatible with standard Fortran 77 1231 storage. That is, the stored row and column indices can begin at 1232 either one (as in Fortran) or zero. See the users' manual for details. 1233 1234 Specify the preallocated storage with either nz or nnz (not both). 1235 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1236 allocation. For large problems you MUST preallocate memory or you 1237 will get TERRIBLE performance, see the users' manual chapter on matrices. 1238 1239 By default, this format uses inodes (identical nodes) when possible, to 1240 improve numerical efficiency of matrix-vector products and solves. We 1241 search for consecutive rows with the same nonzero structure, thereby 1242 reusing matrix information to achieve increased efficiency. 1243 1244 Level: intermediate 1245 1246 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1247 @*/ 1248 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1249 { 1250 PetscFunctionBegin; 1251 PetscCall(PetscKokkosInitializeCheck()); 1252 PetscCall(MatCreate(comm,A)); 1253 PetscCall(MatSetSizes(*A,m,n,m,n)); 1254 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1255 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1256 PetscFunctionReturn(0); 1257 } 1258 1259 typedef Kokkos::TeamPolicy<>::member_type team_member; 1260 // 1261 // This factorization exploits block diagonal matrices with "Nf" (not used). 1262 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1263 // 1264 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1265 { 1266 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1267 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1268 IS isrow = b->row,isicol = b->icol; 1269 const PetscInt *r_h,*ic_h; 1270 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1271 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1272 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1273 PetscBool row_identity,col_identity; 1274 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1275 1276 PetscFunctionBegin; 1277 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1278 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1279 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1280 PetscCall(ISGetIndices(isrow,&r_h)); 1281 PetscCall(ISGetIndices(isicol,&ic_h)); 1282 PetscCall(ISGetSize(isicol,&nc)); 1283 PetscCall(PetscLogGpuTimeBegin()); 1284 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1285 { 1286 #define KOKKOS_SHARED_LEVEL 1 1287 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1288 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1289 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1290 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1291 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1292 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1293 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1294 size_t flops_h = 0.0; 1295 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1296 Kokkos::View<size_t> d_flops_k ("flops"); 1297 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1298 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1299 Kokkos::deep_copy (d_flops_k, h_flops_k); 1300 Kokkos::deep_copy (d_r_k, h_r_k); 1301 Kokkos::deep_copy (d_ic_k, h_ic_k); 1302 // Fill A --> fact 1303 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1304 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1305 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1306 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1307 // zero rows of B 1308 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1309 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1310 PetscScalar *baL = ba_d + bi_d[rowb]; 1311 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1312 /* zero (unfactored row) */ 1313 for (int j=0;j<nzbL;j++) baL[j] = 0; 1314 for (int j=0;j<nzbU;j++) baU[j] = 0; 1315 }); 1316 // copy A into B 1317 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1318 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1319 const PetscScalar *av = aa_d + ai_d[rowa]; 1320 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1321 /* load in initial (unfactored row) */ 1322 for (int j=0;j<nza;j++) { 1323 PetscInt colb = ic[ajtmp[j]]; 1324 PetscScalar vala = av[j]; 1325 if (colb == rowb) { 1326 *(ba_d + bdiag_d[rowb]) = vala; 1327 } else { 1328 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1329 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1330 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1331 for (int j=0; j<nz ; j++) { 1332 if (pbj[j] == colb) { 1333 pba[j] = vala; 1334 set++; 1335 break; 1336 } 1337 } 1338 #if !defined(PETSC_HAVE_SYCL) 1339 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1340 #endif 1341 } 1342 } 1343 }); 1344 }); 1345 Kokkos::fence(); 1346 1347 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1348 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1349 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1350 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1351 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1352 const PetscInt start = field*nloc, end = start + nloc; 1353 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1354 // A22 panel update for each row A(1,:) and col A(:,1) 1355 for (int ii=start; ii<end-1; ii++) { 1356 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1357 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1358 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1359 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1360 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1361 PetscInt kIdx = j*Ni + field_block_idx; 1362 if (kIdx >= nzUi) /* void */ ; 1363 else { 1364 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1365 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1366 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1367 size_t st_idx; 1368 // find and do L(k,i) = A(:k,i) / A(i,i) 1369 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1370 // get column, there has got to be a better way 1371 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1372 if (pjL[j] == ii) { 1373 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1374 idx = j; // output 1375 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1376 } 1377 }, st_idx); 1378 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1379 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1380 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1381 #endif 1382 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1383 // U(i+1,:end) 1384 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1385 PetscScalar Uij = baUi[uiIdx]; 1386 PetscInt col = bjUi[uiIdx]; 1387 if (col==myk) { 1388 // A_kk = A_kk - L_ki * U_ij(k) 1389 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1390 *Akkv = *Akkv - L_ki() * Uij; // UiK 1391 } else { 1392 PetscScalar *start, *end, *pAkjv=NULL; 1393 PetscInt high, low; 1394 const PetscInt *startj; 1395 if (col<myk) { // L 1396 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1397 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1398 start = pLki+1; // start at pLki+1, A22(myk,1) 1399 startj= bj_d + bi_d[myk] + idx; 1400 end = ba_d + bi_d[myk+1]; 1401 } else { 1402 PetscInt idx = bdiag_d[myk+1]+1; 1403 start = ba_d + idx; 1404 startj= bj_d + idx; 1405 end = ba_d + bdiag_d[myk]; 1406 } 1407 // search for 'col', use bisection search - TODO 1408 low = 0; 1409 high = (PetscInt)(end-start); 1410 while (high-low > 5) { 1411 int t = (low+high)/2; 1412 if (startj[t] > col) high = t; 1413 else low = t; 1414 } 1415 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1416 if (startj[pAkjv-start] == col) break; 1417 } 1418 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1419 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1420 #endif 1421 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1422 } 1423 }); 1424 } 1425 }); 1426 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1427 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1428 } /* endof for (i=0; i<n; i++) { */ 1429 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1430 }); 1431 Kokkos::fence(); 1432 Kokkos::deep_copy (h_flops_k, d_flops_k); 1433 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1434 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1435 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1436 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1437 /* Invert diagonal for simpler triangular solves */ 1438 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1439 int i = start + outer_index*Ni + lg_rank%Ni; 1440 if (i < end) { 1441 PetscScalar *pv = ba_d + bdiag_d[i]; 1442 *pv = 1.0/(*pv); 1443 } 1444 }); 1445 }); 1446 } 1447 PetscCall(PetscLogGpuTimeEnd()); 1448 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1449 PetscCall(ISRestoreIndices(isrow,&r_h)); 1450 1451 PetscCall(ISIdentity(isrow,&row_identity)); 1452 PetscCall(ISIdentity(isicol,&col_identity)); 1453 if (b->inode.size) { 1454 B->ops->solve = MatSolve_SeqAIJ_Inode; 1455 } else if (row_identity && col_identity) { 1456 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1457 } else { 1458 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1459 } 1460 B->offloadmask = PETSC_OFFLOAD_GPU; 1461 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1462 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1463 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1464 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1465 B->ops->matsolve = MatMatSolve_SeqAIJ; 1466 B->assembled = PETSC_TRUE; 1467 B->preallocated = PETSC_TRUE; 1468 1469 PetscFunctionReturn(0); 1470 } 1471 1472 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1473 { 1474 PetscFunctionBegin; 1475 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1476 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1477 PetscFunctionReturn(0); 1478 } 1479 1480 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1481 { 1482 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1483 1484 PetscFunctionBegin; 1485 if (!factors->sptrsv_symbolic_completed) { 1486 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1487 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1488 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1489 } 1490 PetscFunctionReturn(0); 1491 } 1492 1493 /* Check if we need to update factors etc for transpose solve */ 1494 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1495 { 1496 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1497 MatColIdxType n = A->rmap->n; 1498 1499 PetscFunctionBegin; 1500 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1501 /* Update L^T and do sptrsv symbolic */ 1502 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1503 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1504 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1505 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1506 1507 KokkosKernels::Impl::transpose_matrix< 1508 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1509 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1510 MatRowMapKokkosView,DefaultExecutionSpace>( 1511 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1512 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1513 1514 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1515 We have to sort the indices, until KK provides finer control options. 1516 */ 1517 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1518 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1519 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1520 1521 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1522 1523 /* Update U^T and do sptrsv symbolic */ 1524 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1525 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1526 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1527 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1528 1529 KokkosKernels::Impl::transpose_matrix< 1530 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1531 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1532 MatRowMapKokkosView,DefaultExecutionSpace>( 1533 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1534 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1535 1536 /* Sort indices. See comments above */ 1537 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1538 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1539 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1540 1541 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1542 factors->transpose_updated = PETSC_TRUE; 1543 } 1544 PetscFunctionReturn(0); 1545 } 1546 1547 /* Solve Ax = b, with A = LU */ 1548 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1549 { 1550 ConstPetscScalarKokkosView bv; 1551 PetscScalarKokkosView xv; 1552 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1553 1554 PetscFunctionBegin; 1555 PetscCall(PetscLogGpuTimeBegin()); 1556 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1557 PetscCall(VecGetKokkosView(b,&bv)); 1558 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1559 /* Solve L tmpv = b */ 1560 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1561 /* Solve Ux = tmpv */ 1562 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1563 PetscCall(VecRestoreKokkosView(b,&bv)); 1564 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1565 PetscCall(PetscLogGpuTimeEnd()); 1566 PetscFunctionReturn(0); 1567 } 1568 1569 /* Solve A^T x = b, where A^T = U^T L^T */ 1570 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1571 { 1572 ConstPetscScalarKokkosView bv; 1573 PetscScalarKokkosView xv; 1574 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1575 1576 PetscFunctionBegin; 1577 PetscCall(PetscLogGpuTimeBegin()); 1578 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1579 PetscCall(VecGetKokkosView(b,&bv)); 1580 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1581 /* Solve U^T tmpv = b */ 1582 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1583 1584 /* Solve L^T x = tmpv */ 1585 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1586 PetscCall(VecRestoreKokkosView(b,&bv)); 1587 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1588 PetscCall(PetscLogGpuTimeEnd()); 1589 PetscFunctionReturn(0); 1590 } 1591 1592 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1593 { 1594 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1595 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1596 PetscInt fill_lev = info->levels; 1597 1598 PetscFunctionBegin; 1599 PetscCall(PetscLogGpuTimeBegin()); 1600 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1601 1602 auto a_d = aijkok->a_dual.view_device(); 1603 auto i_d = aijkok->i_dual.view_device(); 1604 auto j_d = aijkok->j_dual.view_device(); 1605 1606 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1607 1608 B->assembled = PETSC_TRUE; 1609 B->preallocated = PETSC_TRUE; 1610 B->ops->solve = MatSolve_SeqAIJKokkos; 1611 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1612 B->ops->matsolve = NULL; 1613 B->ops->matsolvetranspose = NULL; 1614 B->offloadmask = PETSC_OFFLOAD_GPU; 1615 1616 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1617 factors->transpose_updated = PETSC_FALSE; 1618 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1619 /* TODO: log flops, but how to know that? */ 1620 PetscCall(PetscLogGpuTimeEnd()); 1621 PetscFunctionReturn(0); 1622 } 1623 1624 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1625 { 1626 Mat_SeqAIJKokkos *aijkok; 1627 Mat_SeqAIJ *b; 1628 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1629 PetscInt fill_lev = info->levels; 1630 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1631 PetscInt n = A->rmap->n; 1632 1633 PetscFunctionBegin; 1634 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1635 /* Rebuild factors */ 1636 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1637 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1638 1639 /* Create a spiluk handle and then do symbolic factorization */ 1640 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1641 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1642 1643 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1644 1645 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1646 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1647 Kokkos::realloc(factors->iU_d,n+1); 1648 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1649 1650 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1651 auto i_d = aijkok->i_dual.view_device(); 1652 auto j_d = aijkok->j_dual.view_device(); 1653 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1654 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1655 1656 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1657 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1658 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1659 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1660 1661 /* TODO: add options to select sptrsv algorithms */ 1662 /* Create sptrsv handles for L, U and their transpose */ 1663 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1664 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1665 #else 1666 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1667 #endif 1668 1669 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1670 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1671 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1672 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1673 1674 /* Fill fields of the factor matrix B */ 1675 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1676 b = (Mat_SeqAIJ*)B->data; 1677 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1678 B->info.fill_ratio_given = info->fill; 1679 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1680 1681 B->offloadmask = PETSC_OFFLOAD_GPU; 1682 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1683 PetscFunctionReturn(0); 1684 } 1685 1686 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1687 { 1688 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1689 const PetscInt nrows = A->rmap->n; 1690 1691 PetscFunctionBegin; 1692 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1693 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1694 // move B data into Kokkos 1695 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1696 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1697 { 1698 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1699 if (!baijkok->diag_d.extent(0)) { 1700 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1701 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1702 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1703 } 1704 } 1705 PetscFunctionReturn(0); 1706 } 1707 1708 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1709 { 1710 PetscFunctionBegin; 1711 *type = MATSOLVERKOKKOS; 1712 PetscFunctionReturn(0); 1713 } 1714 1715 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1716 { 1717 PetscFunctionBegin; 1718 *type = MATSOLVERKOKKOSDEVICE; 1719 PetscFunctionReturn(0); 1720 } 1721 1722 /*MC 1723 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1724 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1725 1726 Level: beginner 1727 1728 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1729 M*/ 1730 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1731 { 1732 PetscInt n = A->rmap->n; 1733 1734 PetscFunctionBegin; 1735 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1736 PetscCall(MatSetSizes(*B,n,n,n,n)); 1737 (*B)->factortype = ftype; 1738 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1739 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1740 1741 if (ftype == MAT_FACTOR_LU) { 1742 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1743 (*B)->canuseordering = PETSC_TRUE; 1744 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1745 } else if (ftype == MAT_FACTOR_ILU) { 1746 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1747 (*B)->canuseordering = PETSC_FALSE; 1748 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1749 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1750 1751 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1752 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1753 PetscFunctionReturn(0); 1754 } 1755 1756 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1757 { 1758 PetscInt n = A->rmap->n; 1759 1760 PetscFunctionBegin; 1761 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1762 PetscCall(MatSetSizes(*B,n,n,n,n)); 1763 (*B)->factortype = ftype; 1764 (*B)->canuseordering = PETSC_TRUE; 1765 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1766 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1767 1768 if (ftype == MAT_FACTOR_LU) { 1769 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1770 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1771 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1772 1773 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1774 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1775 PetscFunctionReturn(0); 1776 } 1777 1778 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1779 { 1780 PetscFunctionBegin; 1781 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1782 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1783 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1784 PetscFunctionReturn(0); 1785 } 1786 1787 /* Utility to print out a KokkosCsrMatrix for debugging */ 1788 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1789 { 1790 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1791 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1792 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1793 const PetscInt *i = iv.data(); 1794 const PetscInt *j = jv.data(); 1795 const PetscScalar *a = av.data(); 1796 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1797 1798 PetscFunctionBegin; 1799 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1800 for (PetscInt k=0; k<m; k++) { 1801 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1802 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1803 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p])); 1804 } 1805 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1806 } 1807 PetscFunctionReturn(0); 1808 } 1809