1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 105 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 106 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 107 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 108 */ 109 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110 aijkok->a_dual.sync_host(); 111 *array = aijkok->a_dual.view_host().data(); 112 } else { /* Happens when calling MatSetValues on a newly created matrix */ 113 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114 } 115 PetscFunctionReturn(0); 116 } 117 118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119 { 120 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121 122 PetscFunctionBegin; 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 aijkok->a_dual.sync_host(); 134 *array = aijkok->a_dual.view_host().data(); 135 } else { 136 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 137 } 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142 { 143 PetscFunctionBegin; 144 *array = NULL; 145 PetscFunctionReturn(0); 146 } 147 148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149 { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 165 PetscFunctionBegin; 166 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167 aijkok->a_dual.clear_sync_state(); 168 aijkok->a_dual.modify_host(); 169 } 170 PetscFunctionReturn(0); 171 } 172 173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 177 PetscFunctionBegin; 178 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 179 180 if (i) *i = aijkok->i_device_data(); 181 if (j) *j = aijkok->j_device_data(); 182 if (a) { 183 aijkok->a_dual.sync_device(); 184 *a = aijkok->a_device_data(); 185 } 186 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 187 PetscFunctionReturn(0); 188 } 189 190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195 196 PetscFunctionBegin; 197 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200 PetscFunctionReturn(0); 201 } 202 203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && aijkok->device_mat_d.data()) { 210 *d_mat = aijkok->device_mat_d.data(); 211 } else { 212 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213 *d_mat = NULL; 214 } 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the transpose on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 227 PetscCallCXX(aijkok->a_dual.sync_device()); 228 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 230 aijkok->transpose_updated = PETSC_TRUE; 231 } 232 *csrmatT = &aijkok->csrmatT; 233 PetscFunctionReturn(0); 234 } 235 236 /* Generate the Hermitian on device and cache it internally */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238 { 239 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240 241 PetscFunctionBegin; 242 PetscCall(PetscLogGpuTimeBegin()); 243 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 245 PetscCallCXX(aijkok->a_dual.sync_device()); 246 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 247 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248 #if defined(PETSC_USE_COMPLEX) 249 const auto& a = aijkok->csrmatH.values; 250 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251 #endif 252 aijkok->hermitian_updated = PETSC_TRUE; 253 } 254 *csrmatH = &aijkok->csrmatH; 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A x */ 260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 ConstPetscScalarKokkosView xv; 264 PetscScalarKokkosView yv; 265 266 PetscFunctionBegin; 267 PetscCall(PetscLogGpuTimeBegin()); 268 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 269 PetscCall(VecGetKokkosView(xx,&xv)); 270 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 271 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 273 PetscCall(VecRestoreKokkosView(xx,&xv)); 274 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 276 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 277 PetscCall(PetscLogGpuTimeEnd()); 278 PetscFunctionReturn(0); 279 } 280 281 /* y = A^T x */ 282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 283 { 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 PetscCall(PetscLogGpuTimeBegin()); 292 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 293 PetscCall(VecGetKokkosView(xx,&xv)); 294 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295 if (A->form_explicit_transpose) { 296 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297 mode = "N"; 298 } else { 299 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300 csrmat = &aijkok->csrmat; 301 mode = "T"; 302 } 303 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 304 PetscCall(VecRestoreKokkosView(xx,&xv)); 305 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 306 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 307 PetscCall(PetscLogGpuTimeEnd()); 308 PetscFunctionReturn(0); 309 } 310 311 /* y = A^H x */ 312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 313 { 314 Mat_SeqAIJKokkos *aijkok; 315 const char *mode; 316 ConstPetscScalarKokkosView xv; 317 PetscScalarKokkosView yv; 318 KokkosCsrMatrix *csrmat; 319 320 PetscFunctionBegin; 321 PetscCall(PetscLogGpuTimeBegin()); 322 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 323 PetscCall(VecGetKokkosView(xx,&xv)); 324 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325 if (A->form_explicit_transpose) { 326 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327 mode = "N"; 328 } else { 329 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 csrmat = &aijkok->csrmat; 331 mode = "C"; 332 } 333 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 334 PetscCall(VecRestoreKokkosView(xx,&xv)); 335 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 336 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 337 PetscCall(PetscLogGpuTimeEnd()); 338 PetscFunctionReturn(0); 339 } 340 341 /* z = A x + y */ 342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv,yv; 346 PetscScalarKokkosView zv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx,&xv)); 352 PetscCall(VecGetKokkosView(yy,&yv)); 353 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 354 if (zz != yy) Kokkos::deep_copy(zv,yv); 355 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 357 PetscCall(VecRestoreKokkosView(xx,&xv)); 358 PetscCall(VecRestoreKokkosView(yy,&yv)); 359 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 360 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 361 PetscCall(PetscLogGpuTimeEnd()); 362 PetscFunctionReturn(0); 363 } 364 365 /* z = A^T x + y */ 366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 367 { 368 Mat_SeqAIJKokkos *aijkok; 369 const char *mode; 370 ConstPetscScalarKokkosView xv,yv; 371 PetscScalarKokkosView zv; 372 KokkosCsrMatrix *csrmat; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx,&xv)); 378 PetscCall(VecGetKokkosView(yy,&yv)); 379 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 380 if (zz != yy) Kokkos::deep_copy(zv,yv); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386 csrmat = &aijkok->csrmat; 387 mode = "T"; 388 } 389 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 390 PetscCall(VecRestoreKokkosView(xx,&xv)); 391 PetscCall(VecRestoreKokkosView(yy,&yv)); 392 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 393 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 394 PetscCall(PetscLogGpuTimeEnd()); 395 PetscFunctionReturn(0); 396 } 397 398 /* z = A^H x + y */ 399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 400 { 401 Mat_SeqAIJKokkos *aijkok; 402 const char *mode; 403 ConstPetscScalarKokkosView xv,yv; 404 PetscScalarKokkosView zv; 405 KokkosCsrMatrix *csrmat; 406 407 PetscFunctionBegin; 408 PetscCall(PetscLogGpuTimeBegin()); 409 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 410 PetscCall(VecGetKokkosView(xx,&xv)); 411 PetscCall(VecGetKokkosView(yy,&yv)); 412 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 413 if (zz != yy) Kokkos::deep_copy(zv,yv); 414 if (A->form_explicit_transpose) { 415 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416 mode = "N"; 417 } else { 418 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419 csrmat = &aijkok->csrmat; 420 mode = "C"; 421 } 422 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 423 PetscCall(VecRestoreKokkosView(xx,&xv)); 424 PetscCall(VecRestoreKokkosView(yy,&yv)); 425 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 426 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 427 PetscCall(PetscLogGpuTimeEnd()); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432 { 433 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434 435 PetscFunctionBegin; 436 switch (op) { 437 case MAT_FORM_EXPLICIT_TRANSPOSE: 438 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 439 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440 A->form_explicit_transpose = flg; 441 break; 442 default: 443 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444 break; 445 } 446 PetscFunctionReturn(0); 447 } 448 449 /* Depending on reuse, either build a new mat, or use the existing mat */ 450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 451 { 452 Mat_SeqAIJ *aseq; 453 454 PetscFunctionBegin; 455 PetscCall(PetscKokkosInitializeCheck()); 456 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 457 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 458 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 459 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 461 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 462 PetscCall(PetscFree(A->defaultvectype)); 463 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 464 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 465 PetscCall(MatSetOps_SeqAIJKokkos(A)); 466 aseq = static_cast<Mat_SeqAIJ*>(A->data); 467 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 468 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 470 } 471 } 472 PetscFunctionReturn(0); 473 } 474 475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477 */ 478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 479 { 480 Mat_SeqAIJ *bseq; 481 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482 Mat mat; 483 484 PetscFunctionBegin; 485 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 486 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487 mat = *B; 488 if (A->assembled) { 489 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492 /* Now copy values to B if needed */ 493 if (dupOption == MAT_COPY_VALUES) { 494 if (akok->a_dual.need_sync_device()) { 495 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496 bkok->a_dual.modify_host(); 497 } else { /* If device has the latest data, we only copy data on device */ 498 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499 bkok->a_dual.modify_device(); 500 } 501 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503 bkok->a_dual.modify_host(); 504 } 505 mat->spptr = bkok; 506 } 507 508 PetscCall(PetscFree(mat->defaultvectype)); 509 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 510 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 511 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 516 { 517 Mat At; 518 KokkosCsrMatrix *internT; 519 Mat_SeqAIJKokkos *atkok,*bkok; 520 521 PetscFunctionBegin; 522 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 523 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 524 /* Deep copy internT, as we want to isolate the internal transpose */ 525 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 526 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 527 if (reuse == MAT_INITIAL_MATRIX) *B = At; 528 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 529 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 530 if ((*B)->assembled) { 531 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 532 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 533 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 534 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 535 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 536 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 537 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 538 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 539 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 540 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 541 } 542 PetscFunctionReturn(0); 543 } 544 545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 546 { 547 Mat_SeqAIJKokkos *aijkok; 548 549 PetscFunctionBegin; 550 if (A->factortype == MAT_FACTOR_NONE) { 551 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 A->spptr = NULL; 557 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 558 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 560 PetscCall(MatDestroy_SeqAIJ(A)); 561 PetscFunctionReturn(0); 562 } 563 564 /*MC 565 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 566 567 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 568 569 Options Database Keys: 570 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 571 572 Level: beginner 573 574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 575 M*/ 576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 577 { 578 PetscFunctionBegin; 579 PetscCall(PetscKokkosInitializeCheck()); 580 PetscCall(MatCreate_SeqAIJ(A)); 581 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 582 PetscFunctionReturn(0); 583 } 584 585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 587 { 588 Mat_SeqAIJ *a,*b; 589 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 590 MatScalarKokkosView aa,ba,ca; 591 MatRowMapKokkosView ai,bi,ci; 592 MatColIdxKokkosView aj,bj,cj; 593 PetscInt m,n,nnz,aN; 594 595 PetscFunctionBegin; 596 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 597 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 598 PetscValidPointer(C,4); 599 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 600 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 601 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 602 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 603 604 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 605 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 606 a = static_cast<Mat_SeqAIJ*>(A->data); 607 b = static_cast<Mat_SeqAIJ*>(B->data); 608 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 609 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 610 aa = akok->a_dual.view_device(); 611 ai = akok->i_dual.view_device(); 612 ba = bkok->a_dual.view_device(); 613 bi = bkok->i_dual.view_device(); 614 m = A->rmap->n; /* M, N and nnz of C */ 615 n = A->cmap->n + B->cmap->n; 616 nnz = a->nz + b->nz; 617 aN = A->cmap->n; /* N of A */ 618 if (reuse == MAT_INITIAL_MATRIX) { 619 aj = akok->j_dual.view_device(); 620 bj = bkok->j_dual.view_device(); 621 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 622 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 623 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 624 ca = ca_dual.view_device(); 625 ci = ci_dual.view_device(); 626 cj = cj_dual.view_device(); 627 628 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 629 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 630 PetscInt i = t.league_rank(); /* row i */ 631 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 632 633 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 634 ci(i) = coffset; 635 if (i == m-1) ci(m) = ai(m) + bi(m); 636 }); 637 638 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 639 if (k < alen) { 640 ca(coffset+k) = aa(ai(i)+k); 641 cj(coffset+k) = aj(ai(i)+k); 642 } else { 643 ca(coffset+k) = ba(bi(i)+k-alen); 644 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 645 } 646 }); 647 }); 648 ca_dual.modify_device(); 649 ci_dual.modify_device(); 650 cj_dual.modify_device(); 651 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 652 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 653 } else if (reuse == MAT_REUSE_MATRIX) { 654 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 655 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 656 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 657 ca = ckok->a_dual.view_device(); 658 ci = ckok->i_dual.view_device(); 659 660 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 661 PetscInt i = t.league_rank(); /* row i */ 662 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 663 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 664 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 665 else ca(ci(i)+k) = ba(bi(i)+k-alen); 666 }); 667 }); 668 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 669 } 670 PetscFunctionReturn(0); 671 } 672 673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 674 { 675 PetscFunctionBegin; 676 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 681 { 682 Mat_Product *product = C->product; 683 Mat A,B; 684 bool transA,transB; /* use bool, since KK needs this type */ 685 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 686 Mat_SeqAIJ *c; 687 MatProductData_SeqAIJKokkos *pdata; 688 KokkosCsrMatrix *csrmatA,*csrmatB; 689 690 PetscFunctionBegin; 691 MatCheckProduct(C,1); 692 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 693 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 694 695 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 696 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 697 PetscFunctionReturn(0); 698 } 699 700 switch (product->type) { 701 case MATPRODUCT_AB: transA = false; transB = false; break; 702 case MATPRODUCT_AtB: transA = true; transB = false; break; 703 case MATPRODUCT_ABt: transA = false; transB = true; break; 704 default: 705 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 706 } 707 708 A = product->A; 709 B = product->B; 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 713 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 714 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 715 716 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 717 718 csrmatA = &akok->csrmat; 719 csrmatB = &bkok->csrmat; 720 721 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 722 if (transA) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 724 transA = false; 725 } 726 727 if (transB) { 728 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 729 transB = false; 730 } 731 PetscCall(PetscLogGpuTimeBegin()); 732 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 733 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 734 PetscCall(PetscLogGpuTimeEnd()); 735 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 736 /* shorter version of MatAssemblyEnd_SeqAIJ */ 737 c = (Mat_SeqAIJ*)C->data; 738 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 739 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 740 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 741 c->reallocs = 0; 742 C->info.mallocs = 0; 743 C->info.nz_unneeded = 0; 744 C->assembled = C->was_assembled = PETSC_TRUE; 745 C->num_ass++; 746 PetscFunctionReturn(0); 747 } 748 749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 750 { 751 Mat_Product *product = C->product; 752 MatProductType ptype; 753 Mat A,B; 754 bool transA,transB; 755 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 756 MatProductData_SeqAIJKokkos *pdata; 757 MPI_Comm comm; 758 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 759 760 PetscFunctionBegin; 761 MatCheckProduct(C,1); 762 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 763 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 764 A = product->A; 765 B = product->B; 766 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 767 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 768 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 769 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 770 csrmatA = &akok->csrmat; 771 csrmatB = &bkok->csrmat; 772 773 ptype = product->type; 774 switch (ptype) { 775 case MATPRODUCT_AB: transA = false; transB = false; break; 776 case MATPRODUCT_AtB: transA = true; transB = false; break; 777 case MATPRODUCT_ABt: transA = false; transB = true; break; 778 default: 779 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 780 } 781 782 product->data = pdata = new MatProductData_SeqAIJKokkos(); 783 pdata->kh.set_team_work_size(16); 784 pdata->kh.set_dynamic_scheduling(true); 785 pdata->reusesym = product->api_user; 786 787 /* TODO: add command line options to select spgemm algorithms */ 788 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 789 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 790 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 791 */ 792 pdata->kh.create_spgemm_handle(spgemm_alg); 793 794 PetscCall(PetscLogGpuTimeBegin()); 795 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 796 if (transA) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 798 transA = false; 799 } 800 801 if (transB) { 802 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 803 transB = false; 804 } 805 806 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 808 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 809 calling new Mat_SeqAIJKokkos(). 810 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 811 */ 812 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 813 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 814 PetscCall(PetscLogGpuTimeEnd()); 815 816 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 817 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 818 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 819 PetscFunctionReturn(0); 820 } 821 822 /* handles sparse matrix matrix ops */ 823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 824 { 825 Mat_Product *product = mat->product; 826 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827 828 PetscFunctionBegin; 829 MatCheckProduct(mat,1); 830 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 831 if (product->type == MATPRODUCT_ABC) { 832 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 833 } 834 if (Biskok && Ciskok) { 835 switch (product->type) { 836 case MATPRODUCT_AB: 837 case MATPRODUCT_AtB: 838 case MATPRODUCT_ABt: 839 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840 break; 841 case MATPRODUCT_PtAP: 842 case MATPRODUCT_RARt: 843 case MATPRODUCT_ABC: 844 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845 break; 846 default: 847 break; 848 } 849 } else { /* fallback for AIJ */ 850 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856 { 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 PetscCall(PetscLogGpuTimeBegin()); 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 865 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 866 PetscCall(PetscLogGpuTimeEnd()); 867 PetscFunctionReturn(0); 868 } 869 870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871 { 872 Mat_SeqAIJKokkos *aijkok; 873 874 PetscFunctionBegin; 875 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 876 if (aijkok) { /* Only zero the device if data is already there */ 877 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 878 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 879 } else { /* Might be preallocated but not assembled */ 880 PetscCall(MatZeroEntries_SeqAIJ(A)); 881 } 882 PetscFunctionReturn(0); 883 } 884 885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 887 { 888 Mat_SeqAIJKokkos *aijkok; 889 890 PetscFunctionBegin; 891 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 892 PetscValidPointer(kv,2); 893 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 894 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 895 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 896 *kv = aijkok->a_dual.view_device(); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 901 { 902 PetscFunctionBegin; 903 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 904 PetscValidPointer(kv,2); 905 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915 PetscValidPointer(kv,2); 916 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 918 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 919 *kv = aijkok->a_dual.view_device(); 920 PetscFunctionReturn(0); 921 } 922 923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 924 { 925 PetscFunctionBegin; 926 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 927 PetscValidPointer(kv,2); 928 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 929 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 930 PetscFunctionReturn(0); 931 } 932 933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 934 { 935 Mat_SeqAIJKokkos *aijkok; 936 937 PetscFunctionBegin; 938 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 939 PetscValidPointer(kv,2); 940 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 941 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 942 *kv = aijkok->a_dual.view_device(); 943 PetscFunctionReturn(0); 944 } 945 946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 947 { 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 953 PetscFunctionReturn(0); 954 } 955 956 /* Computes Y += alpha X */ 957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958 { 959 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 960 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 961 ConstMatScalarKokkosView Xa; 962 MatScalarKokkosView Ya; 963 964 PetscFunctionBegin; 965 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 966 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 967 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 968 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 969 PetscCall(PetscLogGpuTimeBegin()); 970 971 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 972 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 973 PetscBool e; 974 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 975 if (e) { 976 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 977 if (e) pattern = SAME_NONZERO_PATTERN; 978 } 979 } 980 981 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 982 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 983 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 984 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 985 */ 986 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 987 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 988 Xa = xkok->a_dual.view_device(); 989 Ya = ykok->a_dual.view_device(); 990 991 if (pattern == SAME_NONZERO_PATTERN) { 992 KokkosBlas::axpy(alpha,Xa,Ya); 993 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 994 } else if (pattern == SUBSET_NONZERO_PATTERN) { 995 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 996 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 997 998 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 999 PetscInt i = t.league_rank(); /* row i */ 1000 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1001 PetscInt p,q = Yi(i); 1002 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1003 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1004 if (Xj(p) == Yj(q)) { /* Found it */ 1005 Ya(q) += alpha * Xa(p); 1006 q++; 1007 } else { 1008 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1009 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1010 */ 1011 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1012 } 1013 } 1014 }); 1015 }); 1016 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1017 } else { /* different nonzero patterns */ 1018 Mat Z; 1019 KokkosCsrMatrix zcsr; 1020 KernelHandle kh; 1021 kh.create_spadd_handle(false); 1022 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1023 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1024 zkok = new Mat_SeqAIJKokkos(zcsr); 1025 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1026 PetscCall(MatHeaderReplace(Y,&Z)); 1027 kh.destroy_spadd_handle(); 1028 } 1029 PetscCall(PetscLogGpuTimeEnd()); 1030 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1031 PetscFunctionReturn(0); 1032 } 1033 1034 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1035 { 1036 Mat_SeqAIJKokkos *akok; 1037 Mat_SeqAIJ *aseq; 1038 1039 PetscFunctionBegin; 1040 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1041 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1042 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1043 delete akok; 1044 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1045 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1046 akok->SetUpCOO(aseq); 1047 PetscFunctionReturn(0); 1048 } 1049 1050 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1051 { 1052 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1053 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1054 PetscCount Annz = aseq->nz; 1055 const PetscCountKokkosView& jmap = akok->jmap_d; 1056 const PetscCountKokkosView& perm = akok->perm_d; 1057 MatScalarKokkosView Aa; 1058 ConstMatScalarKokkosView kv; 1059 PetscMemType memtype; 1060 1061 PetscFunctionBegin; 1062 PetscCall(PetscGetMemType(v,&memtype)); 1063 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1064 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1065 } else { 1066 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1067 } 1068 1069 if (imode == INSERT_VALUES) { 1070 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1071 Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 1072 } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1073 1074 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));}); 1075 1076 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1077 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1078 PetscFunctionReturn(0); 1079 } 1080 1081 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1082 { 1083 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1084 MatScalarKokkosView Aa; 1085 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1086 PetscInt m = A->rmap->n; 1087 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1088 1089 PetscFunctionBegin; 1090 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1091 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1092 PetscScalar tmp; 1093 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1094 tmp = Aa(Ai(i)); 1095 Aa(Ai(i)) = Aa(Adiag(i)); 1096 Aa(Adiag(i)) = tmp; 1097 } 1098 }); 1099 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1100 PetscFunctionReturn(0); 1101 } 1102 1103 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1104 { 1105 PetscFunctionBegin; 1106 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1107 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1108 B->offloadmask = PETSC_OFFLOAD_CPU; 1109 PetscFunctionReturn(0); 1110 } 1111 1112 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1113 { 1114 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1115 1116 PetscFunctionBegin; 1117 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1118 A->boundtocpu = PETSC_FALSE; 1119 1120 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1121 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1122 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1123 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1124 A->ops->scale = MatScale_SeqAIJKokkos; 1125 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1126 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1127 A->ops->mult = MatMult_SeqAIJKokkos; 1128 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1129 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1130 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1131 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1132 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1133 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1134 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1135 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1136 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1137 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1138 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1139 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1140 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1141 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1142 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1143 1144 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1145 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1146 PetscFunctionReturn(0); 1147 } 1148 1149 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1150 { 1151 Mat_SeqAIJ *aseq; 1152 PetscInt i,m,n; 1153 1154 PetscFunctionBegin; 1155 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1156 1157 m = akok->nrows(); 1158 n = akok->ncols(); 1159 PetscCall(MatSetSizes(A,m,n,m,n)); 1160 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1161 1162 /* Set up data structures of A as a MATSEQAIJ */ 1163 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1164 aseq = (Mat_SeqAIJ*)(A)->data; 1165 1166 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1167 akok->j_dual.sync_host(); 1168 1169 aseq->i = akok->i_host_data(); 1170 aseq->j = akok->j_host_data(); 1171 aseq->a = akok->a_host_data(); 1172 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1173 aseq->singlemalloc = PETSC_FALSE; 1174 aseq->free_a = PETSC_FALSE; 1175 aseq->free_ij = PETSC_FALSE; 1176 aseq->nz = akok->nnz(); 1177 aseq->maxnz = aseq->nz; 1178 1179 PetscCall(PetscMalloc1(m,&aseq->imax)); 1180 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1181 for (i=0; i<m; i++) { 1182 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1183 } 1184 1185 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1186 akok->nonzerostate = A->nonzerostate; 1187 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1188 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1189 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1190 PetscFunctionReturn(0); 1191 } 1192 1193 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1194 1195 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1196 */ 1197 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1198 { 1199 PetscFunctionBegin; 1200 PetscCall(MatCreate(comm,A)); 1201 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1202 PetscFunctionReturn(0); 1203 } 1204 1205 /* --------------------------------------------------------------------------------*/ 1206 /*@C 1207 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1208 (the default parallel PETSc format). This matrix will ultimately be handled by 1209 Kokkos for calculations. For good matrix 1210 assembly performance the user should preallocate the matrix storage by setting 1211 the parameter nz (or the array nnz). By setting these parameters accurately, 1212 performance during matrix assembly can be increased by more than a factor of 50. 1213 1214 Collective 1215 1216 Input Parameters: 1217 + comm - MPI communicator, set to PETSC_COMM_SELF 1218 . m - number of rows 1219 . n - number of columns 1220 . nz - number of nonzeros per row (same for all rows) 1221 - nnz - array containing the number of nonzeros in the various rows 1222 (possibly different for each row) or NULL 1223 1224 Output Parameter: 1225 . A - the matrix 1226 1227 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1228 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1229 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1230 1231 Notes: 1232 If nnz is given then nz is ignored 1233 1234 The AIJ format (also called the Yale sparse matrix format or 1235 compressed row storage), is fully compatible with standard Fortran 77 1236 storage. That is, the stored row and column indices can begin at 1237 either one (as in Fortran) or zero. See the users' manual for details. 1238 1239 Specify the preallocated storage with either nz or nnz (not both). 1240 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1241 allocation. For large problems you MUST preallocate memory or you 1242 will get TERRIBLE performance, see the users' manual chapter on matrices. 1243 1244 By default, this format uses inodes (identical nodes) when possible, to 1245 improve numerical efficiency of matrix-vector products and solves. We 1246 search for consecutive rows with the same nonzero structure, thereby 1247 reusing matrix information to achieve increased efficiency. 1248 1249 Level: intermediate 1250 1251 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1252 @*/ 1253 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1254 { 1255 PetscFunctionBegin; 1256 PetscCall(PetscKokkosInitializeCheck()); 1257 PetscCall(MatCreate(comm,A)); 1258 PetscCall(MatSetSizes(*A,m,n,m,n)); 1259 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1260 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1261 PetscFunctionReturn(0); 1262 } 1263 1264 typedef Kokkos::TeamPolicy<>::member_type team_member; 1265 // 1266 // This factorization exploits block diagonal matrices with "Nf" (not used). 1267 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1268 // 1269 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1270 { 1271 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1272 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1273 IS isrow = b->row,isicol = b->icol; 1274 const PetscInt *r_h,*ic_h; 1275 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1276 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1277 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1278 PetscBool row_identity,col_identity; 1279 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1280 1281 PetscFunctionBegin; 1282 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1283 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1284 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1285 PetscCall(ISGetIndices(isrow,&r_h)); 1286 PetscCall(ISGetIndices(isicol,&ic_h)); 1287 PetscCall(ISGetSize(isicol,&nc)); 1288 PetscCall(PetscLogGpuTimeBegin()); 1289 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1290 { 1291 #define KOKKOS_SHARED_LEVEL 1 1292 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1293 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1294 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1295 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1296 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1297 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1298 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1299 size_t flops_h = 0.0; 1300 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1301 Kokkos::View<size_t> d_flops_k ("flops"); 1302 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1303 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1304 Kokkos::deep_copy (d_flops_k, h_flops_k); 1305 Kokkos::deep_copy (d_r_k, h_r_k); 1306 Kokkos::deep_copy (d_ic_k, h_ic_k); 1307 // Fill A --> fact 1308 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1309 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1310 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1311 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1312 // zero rows of B 1313 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1314 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1315 PetscScalar *baL = ba_d + bi_d[rowb]; 1316 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1317 /* zero (unfactored row) */ 1318 for (int j=0;j<nzbL;j++) baL[j] = 0; 1319 for (int j=0;j<nzbU;j++) baU[j] = 0; 1320 }); 1321 // copy A into B 1322 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1323 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1324 const PetscScalar *av = aa_d + ai_d[rowa]; 1325 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1326 /* load in initial (unfactored row) */ 1327 for (int j=0;j<nza;j++) { 1328 PetscInt colb = ic[ajtmp[j]]; 1329 PetscScalar vala = av[j]; 1330 if (colb == rowb) { 1331 *(ba_d + bdiag_d[rowb]) = vala; 1332 } else { 1333 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1334 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1335 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1336 for (int j=0; j<nz ; j++) { 1337 if (pbj[j] == colb) { 1338 pba[j] = vala; 1339 set++; 1340 break; 1341 } 1342 } 1343 #if !defined(PETSC_HAVE_SYCL) 1344 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1345 #endif 1346 } 1347 } 1348 }); 1349 }); 1350 Kokkos::fence(); 1351 1352 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1353 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1354 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1355 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1356 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1357 const PetscInt start = field*nloc, end = start + nloc; 1358 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1359 // A22 panel update for each row A(1,:) and col A(:,1) 1360 for (int ii=start; ii<end-1; ii++) { 1361 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1362 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1363 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1364 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1365 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1366 PetscInt kIdx = j*Ni + field_block_idx; 1367 if (kIdx >= nzUi) /* void */ ; 1368 else { 1369 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1370 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1371 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1372 size_t st_idx; 1373 // find and do L(k,i) = A(:k,i) / A(i,i) 1374 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1375 // get column, there has got to be a better way 1376 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1377 if (pjL[j] == ii) { 1378 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1379 idx = j; // output 1380 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1381 } 1382 }, st_idx); 1383 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1384 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1385 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1386 #endif 1387 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1388 // U(i+1,:end) 1389 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1390 PetscScalar Uij = baUi[uiIdx]; 1391 PetscInt col = bjUi[uiIdx]; 1392 if (col==myk) { 1393 // A_kk = A_kk - L_ki * U_ij(k) 1394 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1395 *Akkv = *Akkv - L_ki() * Uij; // UiK 1396 } else { 1397 PetscScalar *start, *end, *pAkjv=NULL; 1398 PetscInt high, low; 1399 const PetscInt *startj; 1400 if (col<myk) { // L 1401 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1402 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1403 start = pLki+1; // start at pLki+1, A22(myk,1) 1404 startj= bj_d + bi_d[myk] + idx; 1405 end = ba_d + bi_d[myk+1]; 1406 } else { 1407 PetscInt idx = bdiag_d[myk+1]+1; 1408 start = ba_d + idx; 1409 startj= bj_d + idx; 1410 end = ba_d + bdiag_d[myk]; 1411 } 1412 // search for 'col', use bisection search - TODO 1413 low = 0; 1414 high = (PetscInt)(end-start); 1415 while (high-low > 5) { 1416 int t = (low+high)/2; 1417 if (startj[t] > col) high = t; 1418 else low = t; 1419 } 1420 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1421 if (startj[pAkjv-start] == col) break; 1422 } 1423 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1424 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1425 #endif 1426 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1427 } 1428 }); 1429 } 1430 }); 1431 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1432 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1433 } /* endof for (i=0; i<n; i++) { */ 1434 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1435 }); 1436 Kokkos::fence(); 1437 Kokkos::deep_copy (h_flops_k, d_flops_k); 1438 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1439 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1440 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1441 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1442 /* Invert diagonal for simpler triangular solves */ 1443 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1444 int i = start + outer_index*Ni + lg_rank%Ni; 1445 if (i < end) { 1446 PetscScalar *pv = ba_d + bdiag_d[i]; 1447 *pv = 1.0/(*pv); 1448 } 1449 }); 1450 }); 1451 } 1452 PetscCall(PetscLogGpuTimeEnd()); 1453 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1454 PetscCall(ISRestoreIndices(isrow,&r_h)); 1455 1456 PetscCall(ISIdentity(isrow,&row_identity)); 1457 PetscCall(ISIdentity(isicol,&col_identity)); 1458 if (b->inode.size) { 1459 B->ops->solve = MatSolve_SeqAIJ_Inode; 1460 } else if (row_identity && col_identity) { 1461 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1462 } else { 1463 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1464 } 1465 B->offloadmask = PETSC_OFFLOAD_GPU; 1466 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1467 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1468 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1469 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1470 B->ops->matsolve = MatMatSolve_SeqAIJ; 1471 B->assembled = PETSC_TRUE; 1472 B->preallocated = PETSC_TRUE; 1473 1474 PetscFunctionReturn(0); 1475 } 1476 1477 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1478 { 1479 PetscFunctionBegin; 1480 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1481 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1482 PetscFunctionReturn(0); 1483 } 1484 1485 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1486 { 1487 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1488 1489 PetscFunctionBegin; 1490 if (!factors->sptrsv_symbolic_completed) { 1491 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1492 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1493 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1494 } 1495 PetscFunctionReturn(0); 1496 } 1497 1498 /* Check if we need to update factors etc for transpose solve */ 1499 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1500 { 1501 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1502 MatColIdxType n = A->rmap->n; 1503 1504 PetscFunctionBegin; 1505 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1506 /* Update L^T and do sptrsv symbolic */ 1507 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1508 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1509 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1510 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1511 1512 KokkosKernels::Impl::transpose_matrix< 1513 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1514 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1515 MatRowMapKokkosView,DefaultExecutionSpace>( 1516 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1517 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1518 1519 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1520 We have to sort the indices, until KK provides finer control options. 1521 */ 1522 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1523 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1524 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1525 1526 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1527 1528 /* Update U^T and do sptrsv symbolic */ 1529 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1530 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1531 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1532 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1533 1534 KokkosKernels::Impl::transpose_matrix< 1535 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1536 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1537 MatRowMapKokkosView,DefaultExecutionSpace>( 1538 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1539 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1540 1541 /* Sort indices. See comments above */ 1542 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1543 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1544 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1545 1546 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1547 factors->transpose_updated = PETSC_TRUE; 1548 } 1549 PetscFunctionReturn(0); 1550 } 1551 1552 /* Solve Ax = b, with A = LU */ 1553 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1554 { 1555 ConstPetscScalarKokkosView bv; 1556 PetscScalarKokkosView xv; 1557 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1558 1559 PetscFunctionBegin; 1560 PetscCall(PetscLogGpuTimeBegin()); 1561 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1562 PetscCall(VecGetKokkosView(b,&bv)); 1563 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1564 /* Solve L tmpv = b */ 1565 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1566 /* Solve Ux = tmpv */ 1567 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1568 PetscCall(VecRestoreKokkosView(b,&bv)); 1569 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1570 PetscCall(PetscLogGpuTimeEnd()); 1571 PetscFunctionReturn(0); 1572 } 1573 1574 /* Solve A^T x = b, where A^T = U^T L^T */ 1575 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1576 { 1577 ConstPetscScalarKokkosView bv; 1578 PetscScalarKokkosView xv; 1579 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1580 1581 PetscFunctionBegin; 1582 PetscCall(PetscLogGpuTimeBegin()); 1583 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1584 PetscCall(VecGetKokkosView(b,&bv)); 1585 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1586 /* Solve U^T tmpv = b */ 1587 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1588 1589 /* Solve L^T x = tmpv */ 1590 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1591 PetscCall(VecRestoreKokkosView(b,&bv)); 1592 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1593 PetscCall(PetscLogGpuTimeEnd()); 1594 PetscFunctionReturn(0); 1595 } 1596 1597 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1598 { 1599 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1600 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1601 PetscInt fill_lev = info->levels; 1602 1603 PetscFunctionBegin; 1604 PetscCall(PetscLogGpuTimeBegin()); 1605 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1606 1607 auto a_d = aijkok->a_dual.view_device(); 1608 auto i_d = aijkok->i_dual.view_device(); 1609 auto j_d = aijkok->j_dual.view_device(); 1610 1611 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1612 1613 B->assembled = PETSC_TRUE; 1614 B->preallocated = PETSC_TRUE; 1615 B->ops->solve = MatSolve_SeqAIJKokkos; 1616 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1617 B->ops->matsolve = NULL; 1618 B->ops->matsolvetranspose = NULL; 1619 B->offloadmask = PETSC_OFFLOAD_GPU; 1620 1621 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1622 factors->transpose_updated = PETSC_FALSE; 1623 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1624 /* TODO: log flops, but how to know that? */ 1625 PetscCall(PetscLogGpuTimeEnd()); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1630 { 1631 Mat_SeqAIJKokkos *aijkok; 1632 Mat_SeqAIJ *b; 1633 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1634 PetscInt fill_lev = info->levels; 1635 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1636 PetscInt n = A->rmap->n; 1637 1638 PetscFunctionBegin; 1639 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1640 /* Rebuild factors */ 1641 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1642 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1643 1644 /* Create a spiluk handle and then do symbolic factorization */ 1645 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1646 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1647 1648 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1649 1650 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1651 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1652 Kokkos::realloc(factors->iU_d,n+1); 1653 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1654 1655 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1656 auto i_d = aijkok->i_dual.view_device(); 1657 auto j_d = aijkok->j_dual.view_device(); 1658 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1659 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1660 1661 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1662 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1663 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1664 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1665 1666 /* TODO: add options to select sptrsv algorithms */ 1667 /* Create sptrsv handles for L, U and their transpose */ 1668 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1669 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1670 #else 1671 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1672 #endif 1673 1674 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1675 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1676 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1677 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1678 1679 /* Fill fields of the factor matrix B */ 1680 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1681 b = (Mat_SeqAIJ*)B->data; 1682 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1683 B->info.fill_ratio_given = info->fill; 1684 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1685 1686 B->offloadmask = PETSC_OFFLOAD_GPU; 1687 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1688 PetscFunctionReturn(0); 1689 } 1690 1691 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1692 { 1693 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1694 const PetscInt nrows = A->rmap->n; 1695 1696 PetscFunctionBegin; 1697 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1698 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1699 // move B data into Kokkos 1700 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1701 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1702 { 1703 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1704 if (!baijkok->diag_d.extent(0)) { 1705 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1706 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1707 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1708 } 1709 } 1710 PetscFunctionReturn(0); 1711 } 1712 1713 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1714 { 1715 PetscFunctionBegin; 1716 *type = MATSOLVERKOKKOS; 1717 PetscFunctionReturn(0); 1718 } 1719 1720 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1721 { 1722 PetscFunctionBegin; 1723 *type = MATSOLVERKOKKOSDEVICE; 1724 PetscFunctionReturn(0); 1725 } 1726 1727 /*MC 1728 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1729 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1730 1731 Level: beginner 1732 1733 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1734 M*/ 1735 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1736 { 1737 PetscInt n = A->rmap->n; 1738 1739 PetscFunctionBegin; 1740 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1741 PetscCall(MatSetSizes(*B,n,n,n,n)); 1742 (*B)->factortype = ftype; 1743 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1744 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1745 1746 if (ftype == MAT_FACTOR_LU) { 1747 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1748 (*B)->canuseordering = PETSC_TRUE; 1749 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1750 } else if (ftype == MAT_FACTOR_ILU) { 1751 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1752 (*B)->canuseordering = PETSC_FALSE; 1753 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1754 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1755 1756 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1757 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1758 PetscFunctionReturn(0); 1759 } 1760 1761 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1762 { 1763 PetscInt n = A->rmap->n; 1764 1765 PetscFunctionBegin; 1766 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1767 PetscCall(MatSetSizes(*B,n,n,n,n)); 1768 (*B)->factortype = ftype; 1769 (*B)->canuseordering = PETSC_TRUE; 1770 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1771 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1772 1773 if (ftype == MAT_FACTOR_LU) { 1774 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1775 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1776 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1777 1778 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1779 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1780 PetscFunctionReturn(0); 1781 } 1782 1783 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1784 { 1785 PetscFunctionBegin; 1786 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1787 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1788 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1789 PetscFunctionReturn(0); 1790 } 1791 1792 /* Utility to print out a KokkosCsrMatrix for debugging */ 1793 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1794 { 1795 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1796 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1797 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1798 const PetscInt *i = iv.data(); 1799 const PetscInt *j = jv.data(); 1800 const PetscScalar *a = av.data(); 1801 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1802 1803 PetscFunctionBegin; 1804 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1805 for (PetscInt k=0; k<m; k++) { 1806 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1807 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1808 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1809 } 1810 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1811 } 1812 PetscFunctionReturn(0); 1813 } 1814