1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 105 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 106 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 107 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 108 */ 109 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110 aijkok->a_dual.sync_host(); 111 *array = aijkok->a_dual.view_host().data(); 112 } else { /* Happens when calling MatSetValues on a newly created matrix */ 113 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114 } 115 PetscFunctionReturn(0); 116 } 117 118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119 { 120 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121 122 PetscFunctionBegin; 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 aijkok->a_dual.sync_host(); 134 *array = aijkok->a_dual.view_host().data(); 135 } else { 136 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 137 } 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142 { 143 PetscFunctionBegin; 144 *array = NULL; 145 PetscFunctionReturn(0); 146 } 147 148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149 { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 165 PetscFunctionBegin; 166 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167 aijkok->a_dual.clear_sync_state(); 168 aijkok->a_dual.modify_host(); 169 } 170 PetscFunctionReturn(0); 171 } 172 173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 177 PetscFunctionBegin; 178 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 179 180 if (i) *i = aijkok->i_device_data(); 181 if (j) *j = aijkok->j_device_data(); 182 if (a) { 183 aijkok->a_dual.sync_device(); 184 *a = aijkok->a_device_data(); 185 } 186 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 187 PetscFunctionReturn(0); 188 } 189 190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195 196 PetscFunctionBegin; 197 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200 PetscFunctionReturn(0); 201 } 202 203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && aijkok->device_mat_d.data()) { 210 *d_mat = aijkok->device_mat_d.data(); 211 } else { 212 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213 *d_mat = NULL; 214 } 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the transpose on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 227 PetscCallCXX(aijkok->a_dual.sync_device()); 228 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 230 aijkok->transpose_updated = PETSC_TRUE; 231 } 232 *csrmatT = &aijkok->csrmatT; 233 PetscFunctionReturn(0); 234 } 235 236 /* Generate the Hermitian on device and cache it internally */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238 { 239 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240 241 PetscFunctionBegin; 242 PetscCall(PetscLogGpuTimeBegin()); 243 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 245 PetscCallCXX(aijkok->a_dual.sync_device()); 246 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 247 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248 #if defined(PETSC_USE_COMPLEX) 249 const auto& a = aijkok->csrmatH.values; 250 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251 #endif 252 aijkok->hermitian_updated = PETSC_TRUE; 253 } 254 *csrmatH = &aijkok->csrmatH; 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A x */ 260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 ConstPetscScalarKokkosView xv; 264 PetscScalarKokkosView yv; 265 266 PetscFunctionBegin; 267 PetscCall(PetscLogGpuTimeBegin()); 268 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 269 PetscCall(VecGetKokkosView(xx,&xv)); 270 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 271 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 273 PetscCall(VecRestoreKokkosView(xx,&xv)); 274 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 276 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 277 PetscCall(PetscLogGpuTimeEnd()); 278 PetscFunctionReturn(0); 279 } 280 281 /* y = A^T x */ 282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 283 { 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 PetscCall(PetscLogGpuTimeBegin()); 292 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 293 PetscCall(VecGetKokkosView(xx,&xv)); 294 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295 if (A->form_explicit_transpose) { 296 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297 mode = "N"; 298 } else { 299 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300 csrmat = &aijkok->csrmat; 301 mode = "T"; 302 } 303 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 304 PetscCall(VecRestoreKokkosView(xx,&xv)); 305 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 306 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 307 PetscCall(PetscLogGpuTimeEnd()); 308 PetscFunctionReturn(0); 309 } 310 311 /* y = A^H x */ 312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 313 { 314 Mat_SeqAIJKokkos *aijkok; 315 const char *mode; 316 ConstPetscScalarKokkosView xv; 317 PetscScalarKokkosView yv; 318 KokkosCsrMatrix *csrmat; 319 320 PetscFunctionBegin; 321 PetscCall(PetscLogGpuTimeBegin()); 322 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 323 PetscCall(VecGetKokkosView(xx,&xv)); 324 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325 if (A->form_explicit_transpose) { 326 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327 mode = "N"; 328 } else { 329 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 csrmat = &aijkok->csrmat; 331 mode = "C"; 332 } 333 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 334 PetscCall(VecRestoreKokkosView(xx,&xv)); 335 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 336 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 337 PetscCall(PetscLogGpuTimeEnd()); 338 PetscFunctionReturn(0); 339 } 340 341 /* z = A x + y */ 342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv,yv; 346 PetscScalarKokkosView zv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx,&xv)); 352 PetscCall(VecGetKokkosView(yy,&yv)); 353 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 354 if (zz != yy) Kokkos::deep_copy(zv,yv); 355 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 357 PetscCall(VecRestoreKokkosView(xx,&xv)); 358 PetscCall(VecRestoreKokkosView(yy,&yv)); 359 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 360 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 361 PetscCall(PetscLogGpuTimeEnd()); 362 PetscFunctionReturn(0); 363 } 364 365 /* z = A^T x + y */ 366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 367 { 368 Mat_SeqAIJKokkos *aijkok; 369 const char *mode; 370 ConstPetscScalarKokkosView xv,yv; 371 PetscScalarKokkosView zv; 372 KokkosCsrMatrix *csrmat; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx,&xv)); 378 PetscCall(VecGetKokkosView(yy,&yv)); 379 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 380 if (zz != yy) Kokkos::deep_copy(zv,yv); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386 csrmat = &aijkok->csrmat; 387 mode = "T"; 388 } 389 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 390 PetscCall(VecRestoreKokkosView(xx,&xv)); 391 PetscCall(VecRestoreKokkosView(yy,&yv)); 392 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 393 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 394 PetscCall(PetscLogGpuTimeEnd()); 395 PetscFunctionReturn(0); 396 } 397 398 /* z = A^H x + y */ 399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 400 { 401 Mat_SeqAIJKokkos *aijkok; 402 const char *mode; 403 ConstPetscScalarKokkosView xv,yv; 404 PetscScalarKokkosView zv; 405 KokkosCsrMatrix *csrmat; 406 407 PetscFunctionBegin; 408 PetscCall(PetscLogGpuTimeBegin()); 409 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 410 PetscCall(VecGetKokkosView(xx,&xv)); 411 PetscCall(VecGetKokkosView(yy,&yv)); 412 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 413 if (zz != yy) Kokkos::deep_copy(zv,yv); 414 if (A->form_explicit_transpose) { 415 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416 mode = "N"; 417 } else { 418 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419 csrmat = &aijkok->csrmat; 420 mode = "C"; 421 } 422 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 423 PetscCall(VecRestoreKokkosView(xx,&xv)); 424 PetscCall(VecRestoreKokkosView(yy,&yv)); 425 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 426 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 427 PetscCall(PetscLogGpuTimeEnd()); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432 { 433 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434 435 PetscFunctionBegin; 436 switch (op) { 437 case MAT_FORM_EXPLICIT_TRANSPOSE: 438 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 439 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440 A->form_explicit_transpose = flg; 441 break; 442 default: 443 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444 break; 445 } 446 PetscFunctionReturn(0); 447 } 448 449 /* Depending on reuse, either build a new mat, or use the existing mat */ 450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 451 { 452 Mat_SeqAIJ *aseq; 453 454 PetscFunctionBegin; 455 PetscCall(PetscKokkosInitializeCheck()); 456 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 457 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 458 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 459 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 461 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 462 PetscCall(PetscFree(A->defaultvectype)); 463 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 464 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 465 PetscCall(MatSetOps_SeqAIJKokkos(A)); 466 aseq = static_cast<Mat_SeqAIJ*>(A->data); 467 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 468 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 470 } 471 } 472 PetscFunctionReturn(0); 473 } 474 475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477 */ 478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 479 { 480 Mat_SeqAIJ *bseq; 481 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482 Mat mat; 483 484 PetscFunctionBegin; 485 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 486 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487 mat = *B; 488 if (A->assembled) { 489 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492 /* Now copy values to B if needed */ 493 if (dupOption == MAT_COPY_VALUES) { 494 if (akok->a_dual.need_sync_device()) { 495 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496 bkok->a_dual.modify_host(); 497 } else { /* If device has the latest data, we only copy data on device */ 498 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499 bkok->a_dual.modify_device(); 500 } 501 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503 bkok->a_dual.modify_host(); 504 } 505 mat->spptr = bkok; 506 } 507 508 PetscCall(PetscFree(mat->defaultvectype)); 509 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 510 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 511 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 516 { 517 Mat At; 518 KokkosCsrMatrix *internT; 519 Mat_SeqAIJKokkos *atkok,*bkok; 520 521 PetscFunctionBegin; 522 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 523 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 524 /* Deep copy internT, as we want to isolate the internal transpose */ 525 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 526 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 527 if (reuse == MAT_INITIAL_MATRIX) *B = At; 528 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 529 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 530 if ((*B)->assembled) { 531 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 532 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 533 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 534 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 535 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 536 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 537 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 538 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 539 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 540 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 541 } 542 PetscFunctionReturn(0); 543 } 544 545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 546 { 547 Mat_SeqAIJKokkos *aijkok; 548 549 PetscFunctionBegin; 550 if (A->factortype == MAT_FACTOR_NONE) { 551 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 A->spptr = NULL; 557 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 558 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 560 PetscCall(MatDestroy_SeqAIJ(A)); 561 PetscFunctionReturn(0); 562 } 563 564 /*MC 565 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 566 567 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 568 569 Options Database Keys: 570 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 571 572 Level: beginner 573 574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 575 M*/ 576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 577 { 578 PetscFunctionBegin; 579 PetscCall(PetscKokkosInitializeCheck()); 580 PetscCall(MatCreate_SeqAIJ(A)); 581 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 582 PetscFunctionReturn(0); 583 } 584 585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 587 { 588 Mat_SeqAIJ *a,*b; 589 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 590 MatScalarKokkosView aa,ba,ca; 591 MatRowMapKokkosView ai,bi,ci; 592 MatColIdxKokkosView aj,bj,cj; 593 PetscInt m,n,nnz,aN; 594 595 PetscFunctionBegin; 596 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 597 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 598 PetscValidPointer(C,4); 599 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 600 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 601 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 602 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 603 604 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 605 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 606 a = static_cast<Mat_SeqAIJ*>(A->data); 607 b = static_cast<Mat_SeqAIJ*>(B->data); 608 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 609 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 610 aa = akok->a_dual.view_device(); 611 ai = akok->i_dual.view_device(); 612 ba = bkok->a_dual.view_device(); 613 bi = bkok->i_dual.view_device(); 614 m = A->rmap->n; /* M, N and nnz of C */ 615 n = A->cmap->n + B->cmap->n; 616 nnz = a->nz + b->nz; 617 aN = A->cmap->n; /* N of A */ 618 if (reuse == MAT_INITIAL_MATRIX) { 619 aj = akok->j_dual.view_device(); 620 bj = bkok->j_dual.view_device(); 621 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 622 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 623 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 624 ca = ca_dual.view_device(); 625 ci = ci_dual.view_device(); 626 cj = cj_dual.view_device(); 627 628 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 629 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 630 PetscInt i = t.league_rank(); /* row i */ 631 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 632 633 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 634 ci(i) = coffset; 635 if (i == m-1) ci(m) = ai(m) + bi(m); 636 }); 637 638 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 639 if (k < alen) { 640 ca(coffset+k) = aa(ai(i)+k); 641 cj(coffset+k) = aj(ai(i)+k); 642 } else { 643 ca(coffset+k) = ba(bi(i)+k-alen); 644 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 645 } 646 }); 647 }); 648 ca_dual.modify_device(); 649 ci_dual.modify_device(); 650 cj_dual.modify_device(); 651 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 652 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 653 } else if (reuse == MAT_REUSE_MATRIX) { 654 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 655 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 656 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 657 ca = ckok->a_dual.view_device(); 658 ci = ckok->i_dual.view_device(); 659 660 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 661 PetscInt i = t.league_rank(); /* row i */ 662 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 663 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 664 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 665 else ca(ci(i)+k) = ba(bi(i)+k-alen); 666 }); 667 }); 668 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 669 } 670 PetscFunctionReturn(0); 671 } 672 673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 674 { 675 PetscFunctionBegin; 676 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 681 { 682 Mat_Product *product = C->product; 683 Mat A,B; 684 bool transA,transB; /* use bool, since KK needs this type */ 685 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 686 Mat_SeqAIJ *c; 687 MatProductData_SeqAIJKokkos *pdata; 688 KokkosCsrMatrix *csrmatA,*csrmatB; 689 690 PetscFunctionBegin; 691 MatCheckProduct(C,1); 692 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 693 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 694 695 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 696 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 697 PetscFunctionReturn(0); 698 } 699 700 switch (product->type) { 701 case MATPRODUCT_AB: transA = false; transB = false; break; 702 case MATPRODUCT_AtB: transA = true; transB = false; break; 703 case MATPRODUCT_ABt: transA = false; transB = true; break; 704 default: 705 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 706 } 707 708 A = product->A; 709 B = product->B; 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 713 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 714 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 715 716 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 717 718 csrmatA = &akok->csrmat; 719 csrmatB = &bkok->csrmat; 720 721 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 722 if (transA) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 724 transA = false; 725 } 726 727 if (transB) { 728 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 729 transB = false; 730 } 731 PetscCall(PetscLogGpuTimeBegin()); 732 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 733 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 734 PetscCall(PetscLogGpuTimeEnd()); 735 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 736 /* shorter version of MatAssemblyEnd_SeqAIJ */ 737 c = (Mat_SeqAIJ*)C->data; 738 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 739 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 740 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 741 c->reallocs = 0; 742 C->info.mallocs = 0; 743 C->info.nz_unneeded = 0; 744 C->assembled = C->was_assembled = PETSC_TRUE; 745 C->num_ass++; 746 PetscFunctionReturn(0); 747 } 748 749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 750 { 751 Mat_Product *product = C->product; 752 MatProductType ptype; 753 Mat A,B; 754 bool transA,transB; 755 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 756 MatProductData_SeqAIJKokkos *pdata; 757 MPI_Comm comm; 758 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 759 760 PetscFunctionBegin; 761 MatCheckProduct(C,1); 762 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 763 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 764 A = product->A; 765 B = product->B; 766 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 767 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 768 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 769 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 770 csrmatA = &akok->csrmat; 771 csrmatB = &bkok->csrmat; 772 773 ptype = product->type; 774 switch (ptype) { 775 case MATPRODUCT_AB: transA = false; transB = false; break; 776 case MATPRODUCT_AtB: transA = true; transB = false; break; 777 case MATPRODUCT_ABt: transA = false; transB = true; break; 778 default: 779 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 780 } 781 782 product->data = pdata = new MatProductData_SeqAIJKokkos(); 783 pdata->kh.set_team_work_size(16); 784 pdata->kh.set_dynamic_scheduling(true); 785 pdata->reusesym = product->api_user; 786 787 /* TODO: add command line options to select spgemm algorithms */ 788 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 789 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 790 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 791 */ 792 pdata->kh.create_spgemm_handle(spgemm_alg); 793 794 PetscCall(PetscLogGpuTimeBegin()); 795 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 796 if (transA) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 798 transA = false; 799 } 800 801 if (transB) { 802 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 803 transB = false; 804 } 805 806 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 808 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 809 calling new Mat_SeqAIJKokkos(). 810 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 811 */ 812 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 813 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 814 PetscCall(PetscLogGpuTimeEnd()); 815 816 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 817 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 818 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 819 PetscFunctionReturn(0); 820 } 821 822 /* handles sparse matrix matrix ops */ 823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 824 { 825 Mat_Product *product = mat->product; 826 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827 828 PetscFunctionBegin; 829 MatCheckProduct(mat,1); 830 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 831 if (product->type == MATPRODUCT_ABC) { 832 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 833 } 834 if (Biskok && Ciskok) { 835 switch (product->type) { 836 case MATPRODUCT_AB: 837 case MATPRODUCT_AtB: 838 case MATPRODUCT_ABt: 839 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840 break; 841 case MATPRODUCT_PtAP: 842 case MATPRODUCT_RARt: 843 case MATPRODUCT_ABC: 844 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845 break; 846 default: 847 break; 848 } 849 } else { /* fallback for AIJ */ 850 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856 { 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 PetscCall(PetscLogGpuTimeBegin()); 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 865 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 866 PetscCall(PetscLogGpuTimeEnd()); 867 PetscFunctionReturn(0); 868 } 869 870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871 { 872 Mat_SeqAIJKokkos *aijkok; 873 874 PetscFunctionBegin; 875 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 876 if (aijkok) { /* Only zero the device if data is already there */ 877 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 878 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 879 } else { /* Might be preallocated but not assembled */ 880 PetscCall(MatZeroEntries_SeqAIJ(A)); 881 } 882 PetscFunctionReturn(0); 883 } 884 885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 887 { 888 Mat_SeqAIJKokkos *aijkok; 889 890 PetscFunctionBegin; 891 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 892 PetscValidPointer(kv,2); 893 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 894 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 895 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 896 *kv = aijkok->a_dual.view_device(); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 901 { 902 PetscFunctionBegin; 903 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 904 PetscValidPointer(kv,2); 905 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915 PetscValidPointer(kv,2); 916 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 918 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 919 *kv = aijkok->a_dual.view_device(); 920 PetscFunctionReturn(0); 921 } 922 923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 924 { 925 PetscFunctionBegin; 926 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 927 PetscValidPointer(kv,2); 928 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 929 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 930 PetscFunctionReturn(0); 931 } 932 933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 934 { 935 Mat_SeqAIJKokkos *aijkok; 936 937 PetscFunctionBegin; 938 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 939 PetscValidPointer(kv,2); 940 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 941 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 942 *kv = aijkok->a_dual.view_device(); 943 PetscFunctionReturn(0); 944 } 945 946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 947 { 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 953 PetscFunctionReturn(0); 954 } 955 956 /* Computes Y += alpha X */ 957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958 { 959 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 960 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 961 ConstMatScalarKokkosView Xa; 962 MatScalarKokkosView Ya; 963 964 PetscFunctionBegin; 965 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 966 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 967 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 968 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 969 PetscCall(PetscLogGpuTimeBegin()); 970 971 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 972 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 973 PetscBool e; 974 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 975 if (e) { 976 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 977 if (e) pattern = SAME_NONZERO_PATTERN; 978 } 979 } 980 981 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 982 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 983 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 984 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 985 */ 986 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 987 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 988 Xa = xkok->a_dual.view_device(); 989 Ya = ykok->a_dual.view_device(); 990 991 if (pattern == SAME_NONZERO_PATTERN) { 992 KokkosBlas::axpy(alpha,Xa,Ya); 993 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 994 } else if (pattern == SUBSET_NONZERO_PATTERN) { 995 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 996 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 997 998 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 999 PetscInt i = t.league_rank(); /* row i */ 1000 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1001 PetscInt p,q = Yi(i); 1002 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1003 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1004 if (Xj(p) == Yj(q)) { /* Found it */ 1005 Ya(q) += alpha * Xa(p); 1006 q++; 1007 } else { 1008 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1009 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1010 */ 1011 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = 1012 #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99) 1013 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1014 #else 1015 Kokkos::Experimental::nan("1"); 1016 #endif 1017 } 1018 } 1019 }); 1020 }); 1021 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1022 } else { /* different nonzero patterns */ 1023 Mat Z; 1024 KokkosCsrMatrix zcsr; 1025 KernelHandle kh; 1026 kh.create_spadd_handle(false); 1027 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1028 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1029 zkok = new Mat_SeqAIJKokkos(zcsr); 1030 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1031 PetscCall(MatHeaderReplace(Y,&Z)); 1032 kh.destroy_spadd_handle(); 1033 } 1034 PetscCall(PetscLogGpuTimeEnd()); 1035 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1036 PetscFunctionReturn(0); 1037 } 1038 1039 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1040 { 1041 Mat_SeqAIJKokkos *akok; 1042 Mat_SeqAIJ *aseq; 1043 1044 PetscFunctionBegin; 1045 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1046 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1047 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1048 delete akok; 1049 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1050 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1051 akok->SetUpCOO(aseq); 1052 PetscFunctionReturn(0); 1053 } 1054 1055 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1056 { 1057 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1058 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1059 PetscCount Annz = aseq->nz; 1060 const PetscCountKokkosView& jmap = akok->jmap_d; 1061 const PetscCountKokkosView& perm = akok->perm_d; 1062 MatScalarKokkosView Aa; 1063 ConstMatScalarKokkosView kv; 1064 PetscMemType memtype; 1065 1066 PetscFunctionBegin; 1067 PetscCall(PetscGetMemType(v,&memtype)); 1068 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1069 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1070 } else { 1071 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1072 } 1073 1074 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1075 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1076 1077 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1078 PetscScalar sum = 0.0; 1079 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1080 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1081 }); 1082 1083 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1084 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1085 PetscFunctionReturn(0); 1086 } 1087 1088 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1089 { 1090 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1091 MatScalarKokkosView Aa; 1092 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1093 PetscInt m = A->rmap->n; 1094 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1095 1096 PetscFunctionBegin; 1097 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1098 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1099 PetscScalar tmp; 1100 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1101 tmp = Aa(Ai(i)); 1102 Aa(Ai(i)) = Aa(Adiag(i)); 1103 Aa(Adiag(i)) = tmp; 1104 } 1105 }); 1106 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1107 PetscFunctionReturn(0); 1108 } 1109 1110 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1111 { 1112 PetscFunctionBegin; 1113 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1114 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1115 B->offloadmask = PETSC_OFFLOAD_CPU; 1116 PetscFunctionReturn(0); 1117 } 1118 1119 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1120 { 1121 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1122 1123 PetscFunctionBegin; 1124 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1125 A->boundtocpu = PETSC_FALSE; 1126 1127 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1128 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1129 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1130 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1131 A->ops->scale = MatScale_SeqAIJKokkos; 1132 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1133 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1134 A->ops->mult = MatMult_SeqAIJKokkos; 1135 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1136 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1137 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1138 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1139 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1140 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1141 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1142 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1143 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1144 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1145 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1146 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1147 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1148 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1149 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1150 1151 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1152 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1153 PetscFunctionReturn(0); 1154 } 1155 1156 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1157 { 1158 Mat_SeqAIJ *aseq; 1159 PetscInt i,m,n; 1160 1161 PetscFunctionBegin; 1162 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1163 1164 m = akok->nrows(); 1165 n = akok->ncols(); 1166 PetscCall(MatSetSizes(A,m,n,m,n)); 1167 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1168 1169 /* Set up data structures of A as a MATSEQAIJ */ 1170 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1171 aseq = (Mat_SeqAIJ*)(A)->data; 1172 1173 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1174 akok->j_dual.sync_host(); 1175 1176 aseq->i = akok->i_host_data(); 1177 aseq->j = akok->j_host_data(); 1178 aseq->a = akok->a_host_data(); 1179 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1180 aseq->singlemalloc = PETSC_FALSE; 1181 aseq->free_a = PETSC_FALSE; 1182 aseq->free_ij = PETSC_FALSE; 1183 aseq->nz = akok->nnz(); 1184 aseq->maxnz = aseq->nz; 1185 1186 PetscCall(PetscMalloc1(m,&aseq->imax)); 1187 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1188 for (i=0; i<m; i++) { 1189 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1190 } 1191 1192 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1193 akok->nonzerostate = A->nonzerostate; 1194 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1195 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1196 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1197 PetscFunctionReturn(0); 1198 } 1199 1200 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1201 1202 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1203 */ 1204 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1205 { 1206 PetscFunctionBegin; 1207 PetscCall(MatCreate(comm,A)); 1208 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1209 PetscFunctionReturn(0); 1210 } 1211 1212 /* --------------------------------------------------------------------------------*/ 1213 /*@C 1214 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1215 (the default parallel PETSc format). This matrix will ultimately be handled by 1216 Kokkos for calculations. For good matrix 1217 assembly performance the user should preallocate the matrix storage by setting 1218 the parameter nz (or the array nnz). By setting these parameters accurately, 1219 performance during matrix assembly can be increased by more than a factor of 50. 1220 1221 Collective 1222 1223 Input Parameters: 1224 + comm - MPI communicator, set to PETSC_COMM_SELF 1225 . m - number of rows 1226 . n - number of columns 1227 . nz - number of nonzeros per row (same for all rows) 1228 - nnz - array containing the number of nonzeros in the various rows 1229 (possibly different for each row) or NULL 1230 1231 Output Parameter: 1232 . A - the matrix 1233 1234 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1235 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1236 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1237 1238 Notes: 1239 If nnz is given then nz is ignored 1240 1241 The AIJ format (also called the Yale sparse matrix format or 1242 compressed row storage), is fully compatible with standard Fortran 77 1243 storage. That is, the stored row and column indices can begin at 1244 either one (as in Fortran) or zero. See the users' manual for details. 1245 1246 Specify the preallocated storage with either nz or nnz (not both). 1247 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1248 allocation. For large problems you MUST preallocate memory or you 1249 will get TERRIBLE performance, see the users' manual chapter on matrices. 1250 1251 By default, this format uses inodes (identical nodes) when possible, to 1252 improve numerical efficiency of matrix-vector products and solves. We 1253 search for consecutive rows with the same nonzero structure, thereby 1254 reusing matrix information to achieve increased efficiency. 1255 1256 Level: intermediate 1257 1258 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1259 @*/ 1260 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1261 { 1262 PetscFunctionBegin; 1263 PetscCall(PetscKokkosInitializeCheck()); 1264 PetscCall(MatCreate(comm,A)); 1265 PetscCall(MatSetSizes(*A,m,n,m,n)); 1266 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1267 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1268 PetscFunctionReturn(0); 1269 } 1270 1271 typedef Kokkos::TeamPolicy<>::member_type team_member; 1272 // 1273 // This factorization exploits block diagonal matrices with "Nf" (not used). 1274 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1275 // 1276 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1277 { 1278 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1279 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1280 IS isrow = b->row,isicol = b->icol; 1281 const PetscInt *r_h,*ic_h; 1282 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1283 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1284 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1285 PetscBool row_identity,col_identity; 1286 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1287 1288 PetscFunctionBegin; 1289 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1290 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1291 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1292 PetscCall(ISGetIndices(isrow,&r_h)); 1293 PetscCall(ISGetIndices(isicol,&ic_h)); 1294 PetscCall(ISGetSize(isicol,&nc)); 1295 PetscCall(PetscLogGpuTimeBegin()); 1296 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1297 { 1298 #define KOKKOS_SHARED_LEVEL 1 1299 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1300 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1301 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1302 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1303 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1304 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1305 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1306 size_t flops_h = 0.0; 1307 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1308 Kokkos::View<size_t> d_flops_k ("flops"); 1309 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1310 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1311 Kokkos::deep_copy (d_flops_k, h_flops_k); 1312 Kokkos::deep_copy (d_r_k, h_r_k); 1313 Kokkos::deep_copy (d_ic_k, h_ic_k); 1314 // Fill A --> fact 1315 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1316 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1317 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1318 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1319 // zero rows of B 1320 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1321 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1322 PetscScalar *baL = ba_d + bi_d[rowb]; 1323 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1324 /* zero (unfactored row) */ 1325 for (int j=0;j<nzbL;j++) baL[j] = 0; 1326 for (int j=0;j<nzbU;j++) baU[j] = 0; 1327 }); 1328 // copy A into B 1329 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1330 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1331 const PetscScalar *av = aa_d + ai_d[rowa]; 1332 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1333 /* load in initial (unfactored row) */ 1334 for (int j=0;j<nza;j++) { 1335 PetscInt colb = ic[ajtmp[j]]; 1336 PetscScalar vala = av[j]; 1337 if (colb == rowb) { 1338 *(ba_d + bdiag_d[rowb]) = vala; 1339 } else { 1340 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1341 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1342 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1343 for (int j=0; j<nz ; j++) { 1344 if (pbj[j] == colb) { 1345 pba[j] = vala; 1346 set++; 1347 break; 1348 } 1349 } 1350 #if !defined(PETSC_HAVE_SYCL) 1351 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1352 #endif 1353 } 1354 } 1355 }); 1356 }); 1357 Kokkos::fence(); 1358 1359 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1360 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1361 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1362 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1363 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1364 const PetscInt start = field*nloc, end = start + nloc; 1365 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1366 // A22 panel update for each row A(1,:) and col A(:,1) 1367 for (int ii=start; ii<end-1; ii++) { 1368 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1369 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1370 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1371 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1372 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1373 PetscInt kIdx = j*Ni + field_block_idx; 1374 if (kIdx >= nzUi) /* void */ ; 1375 else { 1376 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1377 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1378 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1379 size_t st_idx; 1380 // find and do L(k,i) = A(:k,i) / A(i,i) 1381 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1382 // get column, there has got to be a better way 1383 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1384 if (pjL[j] == ii) { 1385 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1386 idx = j; // output 1387 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1388 } 1389 }, st_idx); 1390 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1391 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1392 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1393 #endif 1394 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1395 // U(i+1,:end) 1396 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1397 PetscScalar Uij = baUi[uiIdx]; 1398 PetscInt col = bjUi[uiIdx]; 1399 if (col==myk) { 1400 // A_kk = A_kk - L_ki * U_ij(k) 1401 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1402 *Akkv = *Akkv - L_ki() * Uij; // UiK 1403 } else { 1404 PetscScalar *start, *end, *pAkjv=NULL; 1405 PetscInt high, low; 1406 const PetscInt *startj; 1407 if (col<myk) { // L 1408 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1409 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1410 start = pLki+1; // start at pLki+1, A22(myk,1) 1411 startj= bj_d + bi_d[myk] + idx; 1412 end = ba_d + bi_d[myk+1]; 1413 } else { 1414 PetscInt idx = bdiag_d[myk+1]+1; 1415 start = ba_d + idx; 1416 startj= bj_d + idx; 1417 end = ba_d + bdiag_d[myk]; 1418 } 1419 // search for 'col', use bisection search - TODO 1420 low = 0; 1421 high = (PetscInt)(end-start); 1422 while (high-low > 5) { 1423 int t = (low+high)/2; 1424 if (startj[t] > col) high = t; 1425 else low = t; 1426 } 1427 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1428 if (startj[pAkjv-start] == col) break; 1429 } 1430 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1431 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1432 #endif 1433 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1434 } 1435 }); 1436 } 1437 }); 1438 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1439 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1440 } /* endof for (i=0; i<n; i++) { */ 1441 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1442 }); 1443 Kokkos::fence(); 1444 Kokkos::deep_copy (h_flops_k, d_flops_k); 1445 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1446 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1447 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1448 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1449 /* Invert diagonal for simpler triangular solves */ 1450 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1451 int i = start + outer_index*Ni + lg_rank%Ni; 1452 if (i < end) { 1453 PetscScalar *pv = ba_d + bdiag_d[i]; 1454 *pv = 1.0/(*pv); 1455 } 1456 }); 1457 }); 1458 } 1459 PetscCall(PetscLogGpuTimeEnd()); 1460 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1461 PetscCall(ISRestoreIndices(isrow,&r_h)); 1462 1463 PetscCall(ISIdentity(isrow,&row_identity)); 1464 PetscCall(ISIdentity(isicol,&col_identity)); 1465 if (b->inode.size) { 1466 B->ops->solve = MatSolve_SeqAIJ_Inode; 1467 } else if (row_identity && col_identity) { 1468 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1469 } else { 1470 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1471 } 1472 B->offloadmask = PETSC_OFFLOAD_GPU; 1473 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1474 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1475 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1476 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1477 B->ops->matsolve = MatMatSolve_SeqAIJ; 1478 B->assembled = PETSC_TRUE; 1479 B->preallocated = PETSC_TRUE; 1480 1481 PetscFunctionReturn(0); 1482 } 1483 1484 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1485 { 1486 PetscFunctionBegin; 1487 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1488 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1489 PetscFunctionReturn(0); 1490 } 1491 1492 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1493 { 1494 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1495 1496 PetscFunctionBegin; 1497 if (!factors->sptrsv_symbolic_completed) { 1498 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1499 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1500 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1501 } 1502 PetscFunctionReturn(0); 1503 } 1504 1505 /* Check if we need to update factors etc for transpose solve */ 1506 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1507 { 1508 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1509 MatColIdxType n = A->rmap->n; 1510 1511 PetscFunctionBegin; 1512 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1513 /* Update L^T and do sptrsv symbolic */ 1514 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1515 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1516 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1517 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1518 1519 KokkosKernels::Impl::transpose_matrix< 1520 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1521 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1522 MatRowMapKokkosView,DefaultExecutionSpace>( 1523 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1524 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1525 1526 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1527 We have to sort the indices, until KK provides finer control options. 1528 */ 1529 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1530 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1531 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1532 1533 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1534 1535 /* Update U^T and do sptrsv symbolic */ 1536 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1537 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1538 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1539 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1540 1541 KokkosKernels::Impl::transpose_matrix< 1542 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1543 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1544 MatRowMapKokkosView,DefaultExecutionSpace>( 1545 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1546 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1547 1548 /* Sort indices. See comments above */ 1549 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1550 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1551 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1552 1553 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1554 factors->transpose_updated = PETSC_TRUE; 1555 } 1556 PetscFunctionReturn(0); 1557 } 1558 1559 /* Solve Ax = b, with A = LU */ 1560 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1561 { 1562 ConstPetscScalarKokkosView bv; 1563 PetscScalarKokkosView xv; 1564 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1565 1566 PetscFunctionBegin; 1567 PetscCall(PetscLogGpuTimeBegin()); 1568 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1569 PetscCall(VecGetKokkosView(b,&bv)); 1570 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1571 /* Solve L tmpv = b */ 1572 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1573 /* Solve Ux = tmpv */ 1574 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1575 PetscCall(VecRestoreKokkosView(b,&bv)); 1576 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1577 PetscCall(PetscLogGpuTimeEnd()); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 /* Solve A^T x = b, where A^T = U^T L^T */ 1582 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1583 { 1584 ConstPetscScalarKokkosView bv; 1585 PetscScalarKokkosView xv; 1586 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1587 1588 PetscFunctionBegin; 1589 PetscCall(PetscLogGpuTimeBegin()); 1590 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1591 PetscCall(VecGetKokkosView(b,&bv)); 1592 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1593 /* Solve U^T tmpv = b */ 1594 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1595 1596 /* Solve L^T x = tmpv */ 1597 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1598 PetscCall(VecRestoreKokkosView(b,&bv)); 1599 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1600 PetscCall(PetscLogGpuTimeEnd()); 1601 PetscFunctionReturn(0); 1602 } 1603 1604 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1605 { 1606 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1607 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1608 PetscInt fill_lev = info->levels; 1609 1610 PetscFunctionBegin; 1611 PetscCall(PetscLogGpuTimeBegin()); 1612 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1613 1614 auto a_d = aijkok->a_dual.view_device(); 1615 auto i_d = aijkok->i_dual.view_device(); 1616 auto j_d = aijkok->j_dual.view_device(); 1617 1618 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1619 1620 B->assembled = PETSC_TRUE; 1621 B->preallocated = PETSC_TRUE; 1622 B->ops->solve = MatSolve_SeqAIJKokkos; 1623 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1624 B->ops->matsolve = NULL; 1625 B->ops->matsolvetranspose = NULL; 1626 B->offloadmask = PETSC_OFFLOAD_GPU; 1627 1628 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1629 factors->transpose_updated = PETSC_FALSE; 1630 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1631 /* TODO: log flops, but how to know that? */ 1632 PetscCall(PetscLogGpuTimeEnd()); 1633 PetscFunctionReturn(0); 1634 } 1635 1636 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1637 { 1638 Mat_SeqAIJKokkos *aijkok; 1639 Mat_SeqAIJ *b; 1640 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1641 PetscInt fill_lev = info->levels; 1642 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1643 PetscInt n = A->rmap->n; 1644 1645 PetscFunctionBegin; 1646 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1647 /* Rebuild factors */ 1648 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1649 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1650 1651 /* Create a spiluk handle and then do symbolic factorization */ 1652 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1653 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1654 1655 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1656 1657 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1658 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1659 Kokkos::realloc(factors->iU_d,n+1); 1660 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1661 1662 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1663 auto i_d = aijkok->i_dual.view_device(); 1664 auto j_d = aijkok->j_dual.view_device(); 1665 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1666 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1667 1668 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1669 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1670 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1671 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1672 1673 /* TODO: add options to select sptrsv algorithms */ 1674 /* Create sptrsv handles for L, U and their transpose */ 1675 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1676 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1677 #else 1678 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1679 #endif 1680 1681 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1682 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1683 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1684 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1685 1686 /* Fill fields of the factor matrix B */ 1687 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1688 b = (Mat_SeqAIJ*)B->data; 1689 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1690 B->info.fill_ratio_given = info->fill; 1691 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1692 1693 B->offloadmask = PETSC_OFFLOAD_GPU; 1694 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1695 PetscFunctionReturn(0); 1696 } 1697 1698 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1699 { 1700 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1701 const PetscInt nrows = A->rmap->n; 1702 1703 PetscFunctionBegin; 1704 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1705 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1706 // move B data into Kokkos 1707 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1708 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1709 { 1710 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1711 if (!baijkok->diag_d.extent(0)) { 1712 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1713 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1714 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1715 } 1716 } 1717 PetscFunctionReturn(0); 1718 } 1719 1720 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1721 { 1722 PetscFunctionBegin; 1723 *type = MATSOLVERKOKKOS; 1724 PetscFunctionReturn(0); 1725 } 1726 1727 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1728 { 1729 PetscFunctionBegin; 1730 *type = MATSOLVERKOKKOSDEVICE; 1731 PetscFunctionReturn(0); 1732 } 1733 1734 /*MC 1735 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1736 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1737 1738 Level: beginner 1739 1740 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1741 M*/ 1742 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1743 { 1744 PetscInt n = A->rmap->n; 1745 1746 PetscFunctionBegin; 1747 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1748 PetscCall(MatSetSizes(*B,n,n,n,n)); 1749 (*B)->factortype = ftype; 1750 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1751 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1752 1753 if (ftype == MAT_FACTOR_LU) { 1754 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1755 (*B)->canuseordering = PETSC_TRUE; 1756 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1757 } else if (ftype == MAT_FACTOR_ILU) { 1758 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1759 (*B)->canuseordering = PETSC_FALSE; 1760 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1761 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1762 1763 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1764 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1765 PetscFunctionReturn(0); 1766 } 1767 1768 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1769 { 1770 PetscInt n = A->rmap->n; 1771 1772 PetscFunctionBegin; 1773 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1774 PetscCall(MatSetSizes(*B,n,n,n,n)); 1775 (*B)->factortype = ftype; 1776 (*B)->canuseordering = PETSC_TRUE; 1777 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1778 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1779 1780 if (ftype == MAT_FACTOR_LU) { 1781 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1782 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1783 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1784 1785 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1786 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1787 PetscFunctionReturn(0); 1788 } 1789 1790 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1791 { 1792 PetscFunctionBegin; 1793 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1794 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1795 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1796 PetscFunctionReturn(0); 1797 } 1798 1799 /* Utility to print out a KokkosCsrMatrix for debugging */ 1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1801 { 1802 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1803 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1804 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1805 const PetscInt *i = iv.data(); 1806 const PetscInt *j = jv.data(); 1807 const PetscScalar *a = av.data(); 1808 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1809 1810 PetscFunctionBegin; 1811 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1812 for (PetscInt k=0; k<m; k++) { 1813 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1814 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1815 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1816 } 1817 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1818 } 1819 PetscFunctionReturn(0); 1820 } 1821