1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 105 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 106 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 107 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 108 */ 109 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110 aijkok->a_dual.sync_host(); 111 *array = aijkok->a_dual.view_host().data(); 112 } else { /* Happens when calling MatSetValues on a newly created matrix */ 113 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114 } 115 PetscFunctionReturn(0); 116 } 117 118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119 { 120 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121 122 PetscFunctionBegin; 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 aijkok->a_dual.sync_host(); 134 *array = aijkok->a_dual.view_host().data(); 135 } else { 136 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 137 } 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142 { 143 PetscFunctionBegin; 144 *array = NULL; 145 PetscFunctionReturn(0); 146 } 147 148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149 { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 165 PetscFunctionBegin; 166 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167 aijkok->a_dual.clear_sync_state(); 168 aijkok->a_dual.modify_host(); 169 } 170 PetscFunctionReturn(0); 171 } 172 173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 177 PetscFunctionBegin; 178 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 179 180 if (i) *i = aijkok->i_device_data(); 181 if (j) *j = aijkok->j_device_data(); 182 if (a) { 183 aijkok->a_dual.sync_device(); 184 *a = aijkok->a_device_data(); 185 } 186 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 187 PetscFunctionReturn(0); 188 } 189 190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195 196 PetscFunctionBegin; 197 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200 PetscFunctionReturn(0); 201 } 202 203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && aijkok->device_mat_d.data()) { 210 *d_mat = aijkok->device_mat_d.data(); 211 } else { 212 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213 *d_mat = NULL; 214 } 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the transpose on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 227 PetscCallCXX(aijkok->a_dual.sync_device()); 228 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 230 aijkok->transpose_updated = PETSC_TRUE; 231 } 232 *csrmatT = &aijkok->csrmatT; 233 PetscFunctionReturn(0); 234 } 235 236 /* Generate the Hermitian on device and cache it internally */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238 { 239 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240 241 PetscFunctionBegin; 242 PetscCall(PetscLogGpuTimeBegin()); 243 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 245 PetscCallCXX(aijkok->a_dual.sync_device()); 246 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 247 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248 #if defined(PETSC_USE_COMPLEX) 249 const auto& a = aijkok->csrmatH.values; 250 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251 #endif 252 aijkok->hermitian_updated = PETSC_TRUE; 253 } 254 *csrmatH = &aijkok->csrmatH; 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A x */ 260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 ConstPetscScalarKokkosView xv; 264 PetscScalarKokkosView yv; 265 266 PetscFunctionBegin; 267 PetscCall(PetscLogGpuTimeBegin()); 268 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 269 PetscCall(VecGetKokkosView(xx,&xv)); 270 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 271 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 273 PetscCall(VecRestoreKokkosView(xx,&xv)); 274 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 276 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 277 PetscCall(PetscLogGpuTimeEnd()); 278 PetscFunctionReturn(0); 279 } 280 281 /* y = A^T x */ 282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 283 { 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 PetscCall(PetscLogGpuTimeBegin()); 292 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 293 PetscCall(VecGetKokkosView(xx,&xv)); 294 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295 if (A->form_explicit_transpose) { 296 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297 mode = "N"; 298 } else { 299 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300 csrmat = &aijkok->csrmat; 301 mode = "T"; 302 } 303 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 304 PetscCall(VecRestoreKokkosView(xx,&xv)); 305 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 306 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 307 PetscCall(PetscLogGpuTimeEnd()); 308 PetscFunctionReturn(0); 309 } 310 311 /* y = A^H x */ 312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 313 { 314 Mat_SeqAIJKokkos *aijkok; 315 const char *mode; 316 ConstPetscScalarKokkosView xv; 317 PetscScalarKokkosView yv; 318 KokkosCsrMatrix *csrmat; 319 320 PetscFunctionBegin; 321 PetscCall(PetscLogGpuTimeBegin()); 322 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 323 PetscCall(VecGetKokkosView(xx,&xv)); 324 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325 if (A->form_explicit_transpose) { 326 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327 mode = "N"; 328 } else { 329 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 csrmat = &aijkok->csrmat; 331 mode = "C"; 332 } 333 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 334 PetscCall(VecRestoreKokkosView(xx,&xv)); 335 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 336 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 337 PetscCall(PetscLogGpuTimeEnd()); 338 PetscFunctionReturn(0); 339 } 340 341 /* z = A x + y */ 342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv,yv; 346 PetscScalarKokkosView zv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx,&xv)); 352 PetscCall(VecGetKokkosView(yy,&yv)); 353 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 354 if (zz != yy) Kokkos::deep_copy(zv,yv); 355 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 357 PetscCall(VecRestoreKokkosView(xx,&xv)); 358 PetscCall(VecRestoreKokkosView(yy,&yv)); 359 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 360 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 361 PetscCall(PetscLogGpuTimeEnd()); 362 PetscFunctionReturn(0); 363 } 364 365 /* z = A^T x + y */ 366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 367 { 368 Mat_SeqAIJKokkos *aijkok; 369 const char *mode; 370 ConstPetscScalarKokkosView xv,yv; 371 PetscScalarKokkosView zv; 372 KokkosCsrMatrix *csrmat; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx,&xv)); 378 PetscCall(VecGetKokkosView(yy,&yv)); 379 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 380 if (zz != yy) Kokkos::deep_copy(zv,yv); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386 csrmat = &aijkok->csrmat; 387 mode = "T"; 388 } 389 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 390 PetscCall(VecRestoreKokkosView(xx,&xv)); 391 PetscCall(VecRestoreKokkosView(yy,&yv)); 392 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 393 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 394 PetscCall(PetscLogGpuTimeEnd()); 395 PetscFunctionReturn(0); 396 } 397 398 /* z = A^H x + y */ 399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 400 { 401 Mat_SeqAIJKokkos *aijkok; 402 const char *mode; 403 ConstPetscScalarKokkosView xv,yv; 404 PetscScalarKokkosView zv; 405 KokkosCsrMatrix *csrmat; 406 407 PetscFunctionBegin; 408 PetscCall(PetscLogGpuTimeBegin()); 409 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 410 PetscCall(VecGetKokkosView(xx,&xv)); 411 PetscCall(VecGetKokkosView(yy,&yv)); 412 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 413 if (zz != yy) Kokkos::deep_copy(zv,yv); 414 if (A->form_explicit_transpose) { 415 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416 mode = "N"; 417 } else { 418 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419 csrmat = &aijkok->csrmat; 420 mode = "C"; 421 } 422 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 423 PetscCall(VecRestoreKokkosView(xx,&xv)); 424 PetscCall(VecRestoreKokkosView(yy,&yv)); 425 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 426 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 427 PetscCall(PetscLogGpuTimeEnd()); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432 { 433 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434 435 PetscFunctionBegin; 436 switch (op) { 437 case MAT_FORM_EXPLICIT_TRANSPOSE: 438 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 439 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440 A->form_explicit_transpose = flg; 441 break; 442 default: 443 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444 break; 445 } 446 PetscFunctionReturn(0); 447 } 448 449 /* Depending on reuse, either build a new mat, or use the existing mat */ 450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 451 { 452 Mat_SeqAIJ *aseq; 453 454 PetscFunctionBegin; 455 PetscCall(PetscKokkosInitializeCheck()); 456 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 457 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 458 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 459 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 461 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 462 PetscCall(PetscFree(A->defaultvectype)); 463 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 464 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 465 PetscCall(MatSetOps_SeqAIJKokkos(A)); 466 aseq = static_cast<Mat_SeqAIJ*>(A->data); 467 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 468 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 470 } 471 } 472 PetscFunctionReturn(0); 473 } 474 475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477 */ 478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 479 { 480 Mat_SeqAIJ *bseq; 481 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482 Mat mat; 483 484 PetscFunctionBegin; 485 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 486 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487 mat = *B; 488 if (A->assembled) { 489 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492 /* Now copy values to B if needed */ 493 if (dupOption == MAT_COPY_VALUES) { 494 if (akok->a_dual.need_sync_device()) { 495 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496 bkok->a_dual.modify_host(); 497 } else { /* If device has the latest data, we only copy data on device */ 498 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499 bkok->a_dual.modify_device(); 500 } 501 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503 bkok->a_dual.modify_host(); 504 } 505 mat->spptr = bkok; 506 } 507 508 PetscCall(PetscFree(mat->defaultvectype)); 509 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 510 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 511 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 516 { 517 Mat At; 518 KokkosCsrMatrix *internT; 519 Mat_SeqAIJKokkos *atkok,*bkok; 520 521 PetscFunctionBegin; 522 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 523 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 524 /* Deep copy internT, as we want to isolate the internal transpose */ 525 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 526 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 527 if (reuse == MAT_INITIAL_MATRIX) *B = At; 528 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 529 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 530 if ((*B)->assembled) { 531 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 532 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 533 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 534 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 535 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 536 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 537 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 538 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 539 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 540 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 541 } 542 PetscFunctionReturn(0); 543 } 544 545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 546 { 547 Mat_SeqAIJKokkos *aijkok; 548 549 PetscFunctionBegin; 550 if (A->factortype == MAT_FACTOR_NONE) { 551 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 A->spptr = NULL; 557 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 558 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 560 PetscCall(MatDestroy_SeqAIJ(A)); 561 PetscFunctionReturn(0); 562 } 563 564 /*MC 565 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 566 567 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 568 569 Options Database Keys: 570 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 571 572 Level: beginner 573 574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 575 M*/ 576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 577 { 578 PetscFunctionBegin; 579 PetscCall(PetscKokkosInitializeCheck()); 580 PetscCall(MatCreate_SeqAIJ(A)); 581 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 582 PetscFunctionReturn(0); 583 } 584 585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 587 { 588 Mat_SeqAIJ *a,*b; 589 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 590 MatScalarKokkosView aa,ba,ca; 591 MatRowMapKokkosView ai,bi,ci; 592 MatColIdxKokkosView aj,bj,cj; 593 PetscInt m,n,nnz,aN; 594 595 PetscFunctionBegin; 596 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 597 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 598 PetscValidPointer(C,4); 599 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 600 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 601 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 602 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 603 604 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 605 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 606 a = static_cast<Mat_SeqAIJ*>(A->data); 607 b = static_cast<Mat_SeqAIJ*>(B->data); 608 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 609 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 610 aa = akok->a_dual.view_device(); 611 ai = akok->i_dual.view_device(); 612 ba = bkok->a_dual.view_device(); 613 bi = bkok->i_dual.view_device(); 614 m = A->rmap->n; /* M, N and nnz of C */ 615 n = A->cmap->n + B->cmap->n; 616 nnz = a->nz + b->nz; 617 aN = A->cmap->n; /* N of A */ 618 if (reuse == MAT_INITIAL_MATRIX) { 619 aj = akok->j_dual.view_device(); 620 bj = bkok->j_dual.view_device(); 621 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 622 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 623 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 624 ca = ca_dual.view_device(); 625 ci = ci_dual.view_device(); 626 cj = cj_dual.view_device(); 627 628 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 629 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 630 PetscInt i = t.league_rank(); /* row i */ 631 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 632 633 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 634 ci(i) = coffset; 635 if (i == m-1) ci(m) = ai(m) + bi(m); 636 }); 637 638 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 639 if (k < alen) { 640 ca(coffset+k) = aa(ai(i)+k); 641 cj(coffset+k) = aj(ai(i)+k); 642 } else { 643 ca(coffset+k) = ba(bi(i)+k-alen); 644 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 645 } 646 }); 647 }); 648 ca_dual.modify_device(); 649 ci_dual.modify_device(); 650 cj_dual.modify_device(); 651 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 652 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 653 } else if (reuse == MAT_REUSE_MATRIX) { 654 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 655 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 656 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 657 ca = ckok->a_dual.view_device(); 658 ci = ckok->i_dual.view_device(); 659 660 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 661 PetscInt i = t.league_rank(); /* row i */ 662 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 663 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 664 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 665 else ca(ci(i)+k) = ba(bi(i)+k-alen); 666 }); 667 }); 668 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 669 } 670 PetscFunctionReturn(0); 671 } 672 673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 674 { 675 PetscFunctionBegin; 676 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 681 { 682 Mat_Product *product = C->product; 683 Mat A,B; 684 bool transA,transB; /* use bool, since KK needs this type */ 685 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 686 Mat_SeqAIJ *c; 687 MatProductData_SeqAIJKokkos *pdata; 688 KokkosCsrMatrix *csrmatA,*csrmatB; 689 690 PetscFunctionBegin; 691 MatCheckProduct(C,1); 692 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 693 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 694 695 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 696 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 697 PetscFunctionReturn(0); 698 } 699 700 switch (product->type) { 701 case MATPRODUCT_AB: transA = false; transB = false; break; 702 case MATPRODUCT_AtB: transA = true; transB = false; break; 703 case MATPRODUCT_ABt: transA = false; transB = true; break; 704 default: 705 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 706 } 707 708 A = product->A; 709 B = product->B; 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 713 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 714 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 715 716 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 717 718 csrmatA = &akok->csrmat; 719 csrmatB = &bkok->csrmat; 720 721 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 722 if (transA) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 724 transA = false; 725 } 726 727 if (transB) { 728 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 729 transB = false; 730 } 731 PetscCall(PetscLogGpuTimeBegin()); 732 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 733 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 734 PetscCall(PetscLogGpuTimeEnd()); 735 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 736 /* shorter version of MatAssemblyEnd_SeqAIJ */ 737 c = (Mat_SeqAIJ*)C->data; 738 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 739 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 740 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 741 c->reallocs = 0; 742 C->info.mallocs = 0; 743 C->info.nz_unneeded = 0; 744 C->assembled = C->was_assembled = PETSC_TRUE; 745 C->num_ass++; 746 PetscFunctionReturn(0); 747 } 748 749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 750 { 751 Mat_Product *product = C->product; 752 MatProductType ptype; 753 Mat A,B; 754 bool transA,transB; 755 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 756 MatProductData_SeqAIJKokkos *pdata; 757 MPI_Comm comm; 758 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 759 760 PetscFunctionBegin; 761 MatCheckProduct(C,1); 762 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 763 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 764 A = product->A; 765 B = product->B; 766 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 767 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 768 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 769 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 770 csrmatA = &akok->csrmat; 771 csrmatB = &bkok->csrmat; 772 773 ptype = product->type; 774 switch (ptype) { 775 case MATPRODUCT_AB: transA = false; transB = false; break; 776 case MATPRODUCT_AtB: transA = true; transB = false; break; 777 case MATPRODUCT_ABt: transA = false; transB = true; break; 778 default: 779 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 780 } 781 782 product->data = pdata = new MatProductData_SeqAIJKokkos(); 783 pdata->kh.set_team_work_size(16); 784 pdata->kh.set_dynamic_scheduling(true); 785 pdata->reusesym = product->api_user; 786 787 /* TODO: add command line options to select spgemm algorithms */ 788 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 789 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 790 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 791 */ 792 pdata->kh.create_spgemm_handle(spgemm_alg); 793 794 PetscCall(PetscLogGpuTimeBegin()); 795 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 796 if (transA) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 798 transA = false; 799 } 800 801 if (transB) { 802 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 803 transB = false; 804 } 805 806 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 808 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 809 calling new Mat_SeqAIJKokkos(). 810 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 811 */ 812 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 813 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 814 PetscCall(PetscLogGpuTimeEnd()); 815 816 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 817 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 818 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 819 PetscFunctionReturn(0); 820 } 821 822 /* handles sparse matrix matrix ops */ 823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 824 { 825 Mat_Product *product = mat->product; 826 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827 828 PetscFunctionBegin; 829 MatCheckProduct(mat,1); 830 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 831 if (product->type == MATPRODUCT_ABC) { 832 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 833 } 834 if (Biskok && Ciskok) { 835 switch (product->type) { 836 case MATPRODUCT_AB: 837 case MATPRODUCT_AtB: 838 case MATPRODUCT_ABt: 839 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840 break; 841 case MATPRODUCT_PtAP: 842 case MATPRODUCT_RARt: 843 case MATPRODUCT_ABC: 844 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845 break; 846 default: 847 break; 848 } 849 } else { /* fallback for AIJ */ 850 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856 { 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 PetscCall(PetscLogGpuTimeBegin()); 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 865 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 866 PetscCall(PetscLogGpuTimeEnd()); 867 PetscFunctionReturn(0); 868 } 869 870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871 { 872 Mat_SeqAIJKokkos *aijkok; 873 874 PetscFunctionBegin; 875 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 876 if (aijkok) { /* Only zero the device if data is already there */ 877 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 878 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 879 } else { /* Might be preallocated but not assembled */ 880 PetscCall(MatZeroEntries_SeqAIJ(A)); 881 } 882 PetscFunctionReturn(0); 883 } 884 885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 887 { 888 Mat_SeqAIJKokkos *aijkok; 889 890 PetscFunctionBegin; 891 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 892 PetscValidPointer(kv,2); 893 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 894 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 895 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 896 *kv = aijkok->a_dual.view_device(); 897 PetscFunctionReturn(0); 898 } 899 900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 901 { 902 PetscFunctionBegin; 903 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 904 PetscValidPointer(kv,2); 905 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 906 PetscFunctionReturn(0); 907 } 908 909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 910 { 911 Mat_SeqAIJKokkos *aijkok; 912 913 PetscFunctionBegin; 914 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915 PetscValidPointer(kv,2); 916 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 918 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 919 *kv = aijkok->a_dual.view_device(); 920 PetscFunctionReturn(0); 921 } 922 923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 924 { 925 PetscFunctionBegin; 926 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 927 PetscValidPointer(kv,2); 928 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 929 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 930 PetscFunctionReturn(0); 931 } 932 933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 934 { 935 Mat_SeqAIJKokkos *aijkok; 936 937 PetscFunctionBegin; 938 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 939 PetscValidPointer(kv,2); 940 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 941 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 942 *kv = aijkok->a_dual.view_device(); 943 PetscFunctionReturn(0); 944 } 945 946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 947 { 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 953 PetscFunctionReturn(0); 954 } 955 956 /* Computes Y += alpha X */ 957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 958 { 959 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 960 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 961 ConstMatScalarKokkosView Xa; 962 MatScalarKokkosView Ya; 963 964 PetscFunctionBegin; 965 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 966 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 967 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 968 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 969 PetscCall(PetscLogGpuTimeBegin()); 970 971 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 972 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 973 PetscBool e; 974 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 975 if (e) { 976 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 977 if (e) pattern = SAME_NONZERO_PATTERN; 978 } 979 } 980 981 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 982 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 983 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 984 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 985 */ 986 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 987 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 988 Xa = xkok->a_dual.view_device(); 989 Ya = ykok->a_dual.view_device(); 990 991 if (pattern == SAME_NONZERO_PATTERN) { 992 KokkosBlas::axpy(alpha,Xa,Ya); 993 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 994 } else if (pattern == SUBSET_NONZERO_PATTERN) { 995 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 996 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 997 998 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 999 PetscInt i = t.league_rank(); /* row i */ 1000 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1001 PetscInt p,q = Yi(i); 1002 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1003 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1004 if (Xj(p) == Yj(q)) { /* Found it */ 1005 Ya(q) += alpha * Xa(p); 1006 q++; 1007 } else { 1008 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1009 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1010 */ 1011 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1012 } 1013 } 1014 }); 1015 }); 1016 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1017 } else { /* different nonzero patterns */ 1018 Mat Z; 1019 KokkosCsrMatrix zcsr; 1020 KernelHandle kh; 1021 kh.create_spadd_handle(false); 1022 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1023 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1024 zkok = new Mat_SeqAIJKokkos(zcsr); 1025 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1026 PetscCall(MatHeaderReplace(Y,&Z)); 1027 kh.destroy_spadd_handle(); 1028 } 1029 PetscCall(PetscLogGpuTimeEnd()); 1030 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1031 PetscFunctionReturn(0); 1032 } 1033 1034 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 1035 { 1036 Mat_SeqAIJKokkos *akok; 1037 Mat_SeqAIJ *aseq; 1038 1039 PetscFunctionBegin; 1040 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1041 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1042 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1043 delete akok; 1044 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1045 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1046 akok->SetUpCOO(aseq); 1047 PetscFunctionReturn(0); 1048 } 1049 1050 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1051 { 1052 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1053 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1054 PetscCount Annz = aseq->nz; 1055 const PetscCountKokkosView& jmap = akok->jmap_d; 1056 const PetscCountKokkosView& perm = akok->perm_d; 1057 MatScalarKokkosView Aa; 1058 ConstMatScalarKokkosView kv; 1059 PetscMemType memtype; 1060 1061 PetscFunctionBegin; 1062 PetscCall(PetscGetMemType(v,&memtype)); 1063 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1064 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1065 } else { 1066 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1067 } 1068 1069 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1070 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1071 1072 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1073 PetscScalar sum = 0.0; 1074 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1075 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1076 }); 1077 1078 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1079 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1080 PetscFunctionReturn(0); 1081 } 1082 1083 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1084 { 1085 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1086 MatScalarKokkosView Aa; 1087 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1088 PetscInt m = A->rmap->n; 1089 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1090 1091 PetscFunctionBegin; 1092 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1093 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1094 PetscScalar tmp; 1095 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1096 tmp = Aa(Ai(i)); 1097 Aa(Ai(i)) = Aa(Adiag(i)); 1098 Aa(Adiag(i)) = tmp; 1099 } 1100 }); 1101 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1102 PetscFunctionReturn(0); 1103 } 1104 1105 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1106 { 1107 PetscFunctionBegin; 1108 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1109 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1110 B->offloadmask = PETSC_OFFLOAD_CPU; 1111 PetscFunctionReturn(0); 1112 } 1113 1114 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1115 { 1116 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1117 1118 PetscFunctionBegin; 1119 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1120 A->boundtocpu = PETSC_FALSE; 1121 1122 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1123 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1124 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1125 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1126 A->ops->scale = MatScale_SeqAIJKokkos; 1127 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1128 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1129 A->ops->mult = MatMult_SeqAIJKokkos; 1130 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1131 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1132 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1133 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1134 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1135 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1136 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1137 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1138 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1139 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1140 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1141 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1142 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1143 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1144 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1145 1146 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1147 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1152 { 1153 Mat_SeqAIJ *aseq; 1154 PetscInt i,m,n; 1155 1156 PetscFunctionBegin; 1157 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1158 1159 m = akok->nrows(); 1160 n = akok->ncols(); 1161 PetscCall(MatSetSizes(A,m,n,m,n)); 1162 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1163 1164 /* Set up data structures of A as a MATSEQAIJ */ 1165 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1166 aseq = (Mat_SeqAIJ*)(A)->data; 1167 1168 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1169 akok->j_dual.sync_host(); 1170 1171 aseq->i = akok->i_host_data(); 1172 aseq->j = akok->j_host_data(); 1173 aseq->a = akok->a_host_data(); 1174 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1175 aseq->singlemalloc = PETSC_FALSE; 1176 aseq->free_a = PETSC_FALSE; 1177 aseq->free_ij = PETSC_FALSE; 1178 aseq->nz = akok->nnz(); 1179 aseq->maxnz = aseq->nz; 1180 1181 PetscCall(PetscMalloc1(m,&aseq->imax)); 1182 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1183 for (i=0; i<m; i++) { 1184 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1185 } 1186 1187 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1188 akok->nonzerostate = A->nonzerostate; 1189 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1190 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1191 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1192 PetscFunctionReturn(0); 1193 } 1194 1195 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1196 1197 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1198 */ 1199 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1200 { 1201 PetscFunctionBegin; 1202 PetscCall(MatCreate(comm,A)); 1203 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1204 PetscFunctionReturn(0); 1205 } 1206 1207 /* --------------------------------------------------------------------------------*/ 1208 /*@C 1209 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1210 (the default parallel PETSc format). This matrix will ultimately be handled by 1211 Kokkos for calculations. For good matrix 1212 assembly performance the user should preallocate the matrix storage by setting 1213 the parameter nz (or the array nnz). By setting these parameters accurately, 1214 performance during matrix assembly can be increased by more than a factor of 50. 1215 1216 Collective 1217 1218 Input Parameters: 1219 + comm - MPI communicator, set to PETSC_COMM_SELF 1220 . m - number of rows 1221 . n - number of columns 1222 . nz - number of nonzeros per row (same for all rows) 1223 - nnz - array containing the number of nonzeros in the various rows 1224 (possibly different for each row) or NULL 1225 1226 Output Parameter: 1227 . A - the matrix 1228 1229 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1230 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1231 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1232 1233 Notes: 1234 If nnz is given then nz is ignored 1235 1236 The AIJ format (also called the Yale sparse matrix format or 1237 compressed row storage), is fully compatible with standard Fortran 77 1238 storage. That is, the stored row and column indices can begin at 1239 either one (as in Fortran) or zero. See the users' manual for details. 1240 1241 Specify the preallocated storage with either nz or nnz (not both). 1242 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1243 allocation. For large problems you MUST preallocate memory or you 1244 will get TERRIBLE performance, see the users' manual chapter on matrices. 1245 1246 By default, this format uses inodes (identical nodes) when possible, to 1247 improve numerical efficiency of matrix-vector products and solves. We 1248 search for consecutive rows with the same nonzero structure, thereby 1249 reusing matrix information to achieve increased efficiency. 1250 1251 Level: intermediate 1252 1253 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1254 @*/ 1255 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1256 { 1257 PetscFunctionBegin; 1258 PetscCall(PetscKokkosInitializeCheck()); 1259 PetscCall(MatCreate(comm,A)); 1260 PetscCall(MatSetSizes(*A,m,n,m,n)); 1261 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1262 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 typedef Kokkos::TeamPolicy<>::member_type team_member; 1267 // 1268 // This factorization exploits block diagonal matrices with "Nf" (not used). 1269 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1270 // 1271 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1272 { 1273 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1274 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1275 IS isrow = b->row,isicol = b->icol; 1276 const PetscInt *r_h,*ic_h; 1277 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1278 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1279 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1280 PetscBool row_identity,col_identity; 1281 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1282 1283 PetscFunctionBegin; 1284 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1285 PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity)); 1286 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1287 PetscCall(ISGetIndices(isrow,&r_h)); 1288 PetscCall(ISGetIndices(isicol,&ic_h)); 1289 PetscCall(ISGetSize(isicol,&nc)); 1290 PetscCall(PetscLogGpuTimeBegin()); 1291 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1292 { 1293 #define KOKKOS_SHARED_LEVEL 1 1294 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1295 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1296 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1297 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1298 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1299 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1300 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1301 size_t flops_h = 0.0; 1302 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1303 Kokkos::View<size_t> d_flops_k ("flops"); 1304 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1305 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1306 Kokkos::deep_copy (d_flops_k, h_flops_k); 1307 Kokkos::deep_copy (d_r_k, h_r_k); 1308 Kokkos::deep_copy (d_ic_k, h_ic_k); 1309 // Fill A --> fact 1310 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1311 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1312 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1313 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1314 // zero rows of B 1315 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1316 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1317 PetscScalar *baL = ba_d + bi_d[rowb]; 1318 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1319 /* zero (unfactored row) */ 1320 for (int j=0;j<nzbL;j++) baL[j] = 0; 1321 for (int j=0;j<nzbU;j++) baU[j] = 0; 1322 }); 1323 // copy A into B 1324 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1325 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1326 const PetscScalar *av = aa_d + ai_d[rowa]; 1327 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1328 /* load in initial (unfactored row) */ 1329 for (int j=0;j<nza;j++) { 1330 PetscInt colb = ic[ajtmp[j]]; 1331 PetscScalar vala = av[j]; 1332 if (colb == rowb) { 1333 *(ba_d + bdiag_d[rowb]) = vala; 1334 } else { 1335 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1336 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1337 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1338 for (int j=0; j<nz ; j++) { 1339 if (pbj[j] == colb) { 1340 pba[j] = vala; 1341 set++; 1342 break; 1343 } 1344 } 1345 #if !defined(PETSC_HAVE_SYCL) 1346 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1347 #endif 1348 } 1349 } 1350 }); 1351 }); 1352 Kokkos::fence(); 1353 1354 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1355 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1356 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1357 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1358 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1359 const PetscInt start = field*nloc, end = start + nloc; 1360 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1361 // A22 panel update for each row A(1,:) and col A(:,1) 1362 for (int ii=start; ii<end-1; ii++) { 1363 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1364 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1365 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1366 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1367 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1368 PetscInt kIdx = j*Ni + field_block_idx; 1369 if (kIdx >= nzUi) /* void */ ; 1370 else { 1371 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1372 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1373 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1374 size_t st_idx; 1375 // find and do L(k,i) = A(:k,i) / A(i,i) 1376 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1377 // get column, there has got to be a better way 1378 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1379 if (pjL[j] == ii) { 1380 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1381 idx = j; // output 1382 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1383 } 1384 }, st_idx); 1385 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1386 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1387 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1388 #endif 1389 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1390 // U(i+1,:end) 1391 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1392 PetscScalar Uij = baUi[uiIdx]; 1393 PetscInt col = bjUi[uiIdx]; 1394 if (col==myk) { 1395 // A_kk = A_kk - L_ki * U_ij(k) 1396 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1397 *Akkv = *Akkv - L_ki() * Uij; // UiK 1398 } else { 1399 PetscScalar *start, *end, *pAkjv=NULL; 1400 PetscInt high, low; 1401 const PetscInt *startj; 1402 if (col<myk) { // L 1403 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1404 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1405 start = pLki+1; // start at pLki+1, A22(myk,1) 1406 startj= bj_d + bi_d[myk] + idx; 1407 end = ba_d + bi_d[myk+1]; 1408 } else { 1409 PetscInt idx = bdiag_d[myk+1]+1; 1410 start = ba_d + idx; 1411 startj= bj_d + idx; 1412 end = ba_d + bdiag_d[myk]; 1413 } 1414 // search for 'col', use bisection search - TODO 1415 low = 0; 1416 high = (PetscInt)(end-start); 1417 while (high-low > 5) { 1418 int t = (low+high)/2; 1419 if (startj[t] > col) high = t; 1420 else low = t; 1421 } 1422 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1423 if (startj[pAkjv-start] == col) break; 1424 } 1425 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1426 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1427 #endif 1428 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1429 } 1430 }); 1431 } 1432 }); 1433 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1434 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1435 } /* endof for (i=0; i<n; i++) { */ 1436 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1437 }); 1438 Kokkos::fence(); 1439 Kokkos::deep_copy (h_flops_k, d_flops_k); 1440 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1441 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1442 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1443 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1444 /* Invert diagonal for simpler triangular solves */ 1445 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1446 int i = start + outer_index*Ni + lg_rank%Ni; 1447 if (i < end) { 1448 PetscScalar *pv = ba_d + bdiag_d[i]; 1449 *pv = 1.0/(*pv); 1450 } 1451 }); 1452 }); 1453 } 1454 PetscCall(PetscLogGpuTimeEnd()); 1455 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1456 PetscCall(ISRestoreIndices(isrow,&r_h)); 1457 1458 PetscCall(ISIdentity(isrow,&row_identity)); 1459 PetscCall(ISIdentity(isicol,&col_identity)); 1460 if (b->inode.size) { 1461 B->ops->solve = MatSolve_SeqAIJ_Inode; 1462 } else if (row_identity && col_identity) { 1463 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1464 } else { 1465 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1466 } 1467 B->offloadmask = PETSC_OFFLOAD_GPU; 1468 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1469 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1470 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1471 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1472 B->ops->matsolve = MatMatSolve_SeqAIJ; 1473 B->assembled = PETSC_TRUE; 1474 B->preallocated = PETSC_TRUE; 1475 1476 PetscFunctionReturn(0); 1477 } 1478 1479 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1480 { 1481 PetscFunctionBegin; 1482 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1483 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1484 PetscFunctionReturn(0); 1485 } 1486 1487 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1488 { 1489 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1490 1491 PetscFunctionBegin; 1492 if (!factors->sptrsv_symbolic_completed) { 1493 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1494 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1495 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1496 } 1497 PetscFunctionReturn(0); 1498 } 1499 1500 /* Check if we need to update factors etc for transpose solve */ 1501 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1502 { 1503 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1504 MatColIdxType n = A->rmap->n; 1505 1506 PetscFunctionBegin; 1507 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1508 /* Update L^T and do sptrsv symbolic */ 1509 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1510 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1511 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1512 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1513 1514 KokkosKernels::Impl::transpose_matrix< 1515 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1516 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1517 MatRowMapKokkosView,DefaultExecutionSpace>( 1518 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1519 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1520 1521 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1522 We have to sort the indices, until KK provides finer control options. 1523 */ 1524 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1525 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1526 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1527 1528 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1529 1530 /* Update U^T and do sptrsv symbolic */ 1531 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1532 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1533 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1534 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1535 1536 KokkosKernels::Impl::transpose_matrix< 1537 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1538 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1539 MatRowMapKokkosView,DefaultExecutionSpace>( 1540 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1541 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1542 1543 /* Sort indices. See comments above */ 1544 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1545 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1546 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1547 1548 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1549 factors->transpose_updated = PETSC_TRUE; 1550 } 1551 PetscFunctionReturn(0); 1552 } 1553 1554 /* Solve Ax = b, with A = LU */ 1555 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1556 { 1557 ConstPetscScalarKokkosView bv; 1558 PetscScalarKokkosView xv; 1559 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1560 1561 PetscFunctionBegin; 1562 PetscCall(PetscLogGpuTimeBegin()); 1563 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1564 PetscCall(VecGetKokkosView(b,&bv)); 1565 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1566 /* Solve L tmpv = b */ 1567 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1568 /* Solve Ux = tmpv */ 1569 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1570 PetscCall(VecRestoreKokkosView(b,&bv)); 1571 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1572 PetscCall(PetscLogGpuTimeEnd()); 1573 PetscFunctionReturn(0); 1574 } 1575 1576 /* Solve A^T x = b, where A^T = U^T L^T */ 1577 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1578 { 1579 ConstPetscScalarKokkosView bv; 1580 PetscScalarKokkosView xv; 1581 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1582 1583 PetscFunctionBegin; 1584 PetscCall(PetscLogGpuTimeBegin()); 1585 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1586 PetscCall(VecGetKokkosView(b,&bv)); 1587 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1588 /* Solve U^T tmpv = b */ 1589 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1590 1591 /* Solve L^T x = tmpv */ 1592 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1593 PetscCall(VecRestoreKokkosView(b,&bv)); 1594 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1595 PetscCall(PetscLogGpuTimeEnd()); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1600 { 1601 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1602 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1603 PetscInt fill_lev = info->levels; 1604 1605 PetscFunctionBegin; 1606 PetscCall(PetscLogGpuTimeBegin()); 1607 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1608 1609 auto a_d = aijkok->a_dual.view_device(); 1610 auto i_d = aijkok->i_dual.view_device(); 1611 auto j_d = aijkok->j_dual.view_device(); 1612 1613 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1614 1615 B->assembled = PETSC_TRUE; 1616 B->preallocated = PETSC_TRUE; 1617 B->ops->solve = MatSolve_SeqAIJKokkos; 1618 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1619 B->ops->matsolve = NULL; 1620 B->ops->matsolvetranspose = NULL; 1621 B->offloadmask = PETSC_OFFLOAD_GPU; 1622 1623 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1624 factors->transpose_updated = PETSC_FALSE; 1625 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1626 /* TODO: log flops, but how to know that? */ 1627 PetscCall(PetscLogGpuTimeEnd()); 1628 PetscFunctionReturn(0); 1629 } 1630 1631 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1632 { 1633 Mat_SeqAIJKokkos *aijkok; 1634 Mat_SeqAIJ *b; 1635 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1636 PetscInt fill_lev = info->levels; 1637 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1638 PetscInt n = A->rmap->n; 1639 1640 PetscFunctionBegin; 1641 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1642 /* Rebuild factors */ 1643 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1644 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1645 1646 /* Create a spiluk handle and then do symbolic factorization */ 1647 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1648 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1649 1650 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1651 1652 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1653 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1654 Kokkos::realloc(factors->iU_d,n+1); 1655 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1656 1657 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1658 auto i_d = aijkok->i_dual.view_device(); 1659 auto j_d = aijkok->j_dual.view_device(); 1660 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1661 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1662 1663 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1664 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1665 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1666 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1667 1668 /* TODO: add options to select sptrsv algorithms */ 1669 /* Create sptrsv handles for L, U and their transpose */ 1670 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1671 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1672 #else 1673 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1674 #endif 1675 1676 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1677 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1678 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1679 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1680 1681 /* Fill fields of the factor matrix B */ 1682 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1683 b = (Mat_SeqAIJ*)B->data; 1684 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1685 B->info.fill_ratio_given = info->fill; 1686 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1687 1688 B->offloadmask = PETSC_OFFLOAD_GPU; 1689 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1690 PetscFunctionReturn(0); 1691 } 1692 1693 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1694 { 1695 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1696 const PetscInt nrows = A->rmap->n; 1697 1698 PetscFunctionBegin; 1699 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1700 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1701 // move B data into Kokkos 1702 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1703 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1704 { 1705 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1706 if (!baijkok->diag_d.extent(0)) { 1707 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1708 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1709 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1710 } 1711 } 1712 PetscFunctionReturn(0); 1713 } 1714 1715 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1716 { 1717 PetscFunctionBegin; 1718 *type = MATSOLVERKOKKOS; 1719 PetscFunctionReturn(0); 1720 } 1721 1722 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1723 { 1724 PetscFunctionBegin; 1725 *type = MATSOLVERKOKKOSDEVICE; 1726 PetscFunctionReturn(0); 1727 } 1728 1729 /*MC 1730 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1731 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1732 1733 Level: beginner 1734 1735 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1736 M*/ 1737 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1738 { 1739 PetscInt n = A->rmap->n; 1740 1741 PetscFunctionBegin; 1742 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1743 PetscCall(MatSetSizes(*B,n,n,n,n)); 1744 (*B)->factortype = ftype; 1745 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1746 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1747 1748 if (ftype == MAT_FACTOR_LU) { 1749 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1750 (*B)->canuseordering = PETSC_TRUE; 1751 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1752 } else if (ftype == MAT_FACTOR_ILU) { 1753 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1754 (*B)->canuseordering = PETSC_FALSE; 1755 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1756 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1757 1758 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1759 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1760 PetscFunctionReturn(0); 1761 } 1762 1763 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1764 { 1765 PetscInt n = A->rmap->n; 1766 1767 PetscFunctionBegin; 1768 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1769 PetscCall(MatSetSizes(*B,n,n,n,n)); 1770 (*B)->factortype = ftype; 1771 (*B)->canuseordering = PETSC_TRUE; 1772 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1773 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1774 1775 if (ftype == MAT_FACTOR_LU) { 1776 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1777 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1778 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1779 1780 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1781 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1782 PetscFunctionReturn(0); 1783 } 1784 1785 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1786 { 1787 PetscFunctionBegin; 1788 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1789 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1790 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1791 PetscFunctionReturn(0); 1792 } 1793 1794 /* Utility to print out a KokkosCsrMatrix for debugging */ 1795 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1796 { 1797 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1798 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1799 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1800 const PetscInt *i = iv.data(); 1801 const PetscInt *j = jv.data(); 1802 const PetscScalar *a = av.data(); 1803 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1804 1805 PetscFunctionBegin; 1806 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1807 for (PetscInt k=0; k<m; k++) { 1808 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1809 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1810 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1811 } 1812 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1813 } 1814 PetscFunctionReturn(0); 1815 } 1816