1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 105 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 106 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 107 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 108 */ 109 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110 aijkok->a_dual.sync_host(); 111 *array = aijkok->a_dual.view_host().data(); 112 } else { /* Happens when calling MatSetValues on a newly created matrix */ 113 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114 } 115 PetscFunctionReturn(0); 116 } 117 118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119 { 120 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121 122 PetscFunctionBegin; 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 aijkok->a_dual.sync_host(); 134 *array = aijkok->a_dual.view_host().data(); 135 } else { 136 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 137 } 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142 { 143 PetscFunctionBegin; 144 *array = NULL; 145 PetscFunctionReturn(0); 146 } 147 148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149 { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 165 PetscFunctionBegin; 166 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167 aijkok->a_dual.clear_sync_state(); 168 aijkok->a_dual.modify_host(); 169 } 170 PetscFunctionReturn(0); 171 } 172 173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 177 PetscFunctionBegin; 178 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 179 180 if (i) *i = aijkok->i_device_data(); 181 if (j) *j = aijkok->j_device_data(); 182 if (a) { 183 aijkok->a_dual.sync_device(); 184 *a = aijkok->a_device_data(); 185 } 186 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 187 PetscFunctionReturn(0); 188 } 189 190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195 196 PetscFunctionBegin; 197 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200 PetscFunctionReturn(0); 201 } 202 203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && aijkok->device_mat_d.data()) { 210 *d_mat = aijkok->device_mat_d.data(); 211 } else { 212 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213 *d_mat = NULL; 214 } 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the transpose on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 227 PetscCallCXX(aijkok->a_dual.sync_device()); 228 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 230 aijkok->transpose_updated = PETSC_TRUE; 231 } 232 *csrmatT = &aijkok->csrmatT; 233 PetscFunctionReturn(0); 234 } 235 236 /* Generate the Hermitian on device and cache it internally */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238 { 239 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240 241 PetscFunctionBegin; 242 PetscCall(PetscLogGpuTimeBegin()); 243 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 245 PetscCallCXX(aijkok->a_dual.sync_device()); 246 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 247 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248 #if defined(PETSC_USE_COMPLEX) 249 const auto& a = aijkok->csrmatH.values; 250 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251 #endif 252 aijkok->hermitian_updated = PETSC_TRUE; 253 } 254 *csrmatH = &aijkok->csrmatH; 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A x */ 260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 ConstPetscScalarKokkosView xv; 264 PetscScalarKokkosView yv; 265 266 PetscFunctionBegin; 267 PetscCall(PetscLogGpuTimeBegin()); 268 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 269 PetscCall(VecGetKokkosView(xx,&xv)); 270 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 271 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 273 PetscCall(VecRestoreKokkosView(xx,&xv)); 274 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 276 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 277 PetscCall(PetscLogGpuTimeEnd()); 278 PetscFunctionReturn(0); 279 } 280 281 /* y = A^T x */ 282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 283 { 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 PetscCall(PetscLogGpuTimeBegin()); 292 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 293 PetscCall(VecGetKokkosView(xx,&xv)); 294 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295 if (A->form_explicit_transpose) { 296 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297 mode = "N"; 298 } else { 299 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300 csrmat = &aijkok->csrmat; 301 mode = "T"; 302 } 303 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 304 PetscCall(VecRestoreKokkosView(xx,&xv)); 305 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 306 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 307 PetscCall(PetscLogGpuTimeEnd()); 308 PetscFunctionReturn(0); 309 } 310 311 /* y = A^H x */ 312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 313 { 314 Mat_SeqAIJKokkos *aijkok; 315 const char *mode; 316 ConstPetscScalarKokkosView xv; 317 PetscScalarKokkosView yv; 318 KokkosCsrMatrix *csrmat; 319 320 PetscFunctionBegin; 321 PetscCall(PetscLogGpuTimeBegin()); 322 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 323 PetscCall(VecGetKokkosView(xx,&xv)); 324 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325 if (A->form_explicit_transpose) { 326 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327 mode = "N"; 328 } else { 329 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 csrmat = &aijkok->csrmat; 331 mode = "C"; 332 } 333 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 334 PetscCall(VecRestoreKokkosView(xx,&xv)); 335 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 336 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 337 PetscCall(PetscLogGpuTimeEnd()); 338 PetscFunctionReturn(0); 339 } 340 341 /* z = A x + y */ 342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv,yv; 346 PetscScalarKokkosView zv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx,&xv)); 352 PetscCall(VecGetKokkosView(yy,&yv)); 353 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 354 if (zz != yy) Kokkos::deep_copy(zv,yv); 355 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 357 PetscCall(VecRestoreKokkosView(xx,&xv)); 358 PetscCall(VecRestoreKokkosView(yy,&yv)); 359 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 360 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 361 PetscCall(PetscLogGpuTimeEnd()); 362 PetscFunctionReturn(0); 363 } 364 365 /* z = A^T x + y */ 366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 367 { 368 Mat_SeqAIJKokkos *aijkok; 369 const char *mode; 370 ConstPetscScalarKokkosView xv,yv; 371 PetscScalarKokkosView zv; 372 KokkosCsrMatrix *csrmat; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx,&xv)); 378 PetscCall(VecGetKokkosView(yy,&yv)); 379 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 380 if (zz != yy) Kokkos::deep_copy(zv,yv); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386 csrmat = &aijkok->csrmat; 387 mode = "T"; 388 } 389 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 390 PetscCall(VecRestoreKokkosView(xx,&xv)); 391 PetscCall(VecRestoreKokkosView(yy,&yv)); 392 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 393 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 394 PetscCall(PetscLogGpuTimeEnd()); 395 PetscFunctionReturn(0); 396 } 397 398 /* z = A^H x + y */ 399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 400 { 401 Mat_SeqAIJKokkos *aijkok; 402 const char *mode; 403 ConstPetscScalarKokkosView xv,yv; 404 PetscScalarKokkosView zv; 405 KokkosCsrMatrix *csrmat; 406 407 PetscFunctionBegin; 408 PetscCall(PetscLogGpuTimeBegin()); 409 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 410 PetscCall(VecGetKokkosView(xx,&xv)); 411 PetscCall(VecGetKokkosView(yy,&yv)); 412 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 413 if (zz != yy) Kokkos::deep_copy(zv,yv); 414 if (A->form_explicit_transpose) { 415 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416 mode = "N"; 417 } else { 418 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419 csrmat = &aijkok->csrmat; 420 mode = "C"; 421 } 422 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 423 PetscCall(VecRestoreKokkosView(xx,&xv)); 424 PetscCall(VecRestoreKokkosView(yy,&yv)); 425 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 426 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 427 PetscCall(PetscLogGpuTimeEnd()); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432 { 433 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434 435 PetscFunctionBegin; 436 switch (op) { 437 case MAT_FORM_EXPLICIT_TRANSPOSE: 438 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 439 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440 A->form_explicit_transpose = flg; 441 break; 442 default: 443 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444 break; 445 } 446 PetscFunctionReturn(0); 447 } 448 449 /* Depending on reuse, either build a new mat, or use the existing mat */ 450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 451 { 452 Mat_SeqAIJ *aseq; 453 454 PetscFunctionBegin; 455 PetscCall(PetscKokkosInitializeCheck()); 456 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 457 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 458 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 459 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 461 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 462 PetscCall(PetscFree(A->defaultvectype)); 463 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 464 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 465 PetscCall(MatSetOps_SeqAIJKokkos(A)); 466 aseq = static_cast<Mat_SeqAIJ*>(A->data); 467 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 468 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 470 } 471 } 472 PetscFunctionReturn(0); 473 } 474 475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477 */ 478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 479 { 480 Mat_SeqAIJ *bseq; 481 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482 Mat mat; 483 484 PetscFunctionBegin; 485 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 486 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487 mat = *B; 488 if (A->assembled) { 489 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492 /* Now copy values to B if needed */ 493 if (dupOption == MAT_COPY_VALUES) { 494 if (akok->a_dual.need_sync_device()) { 495 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496 bkok->a_dual.modify_host(); 497 } else { /* If device has the latest data, we only copy data on device */ 498 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499 bkok->a_dual.modify_device(); 500 } 501 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503 bkok->a_dual.modify_host(); 504 } 505 mat->spptr = bkok; 506 } 507 508 PetscCall(PetscFree(mat->defaultvectype)); 509 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 510 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 511 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 516 { 517 Mat At; 518 KokkosCsrMatrix *internT; 519 Mat_SeqAIJKokkos *atkok,*bkok; 520 521 PetscFunctionBegin; 522 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B)); 523 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 524 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 525 /* Deep copy internT, as we want to isolate the internal transpose */ 526 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 527 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 528 if (reuse == MAT_INITIAL_MATRIX) *B = At; 529 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 530 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 531 if ((*B)->assembled) { 532 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 533 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 534 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 535 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 536 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 537 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 538 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 539 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 540 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 541 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 542 } 543 PetscFunctionReturn(0); 544 } 545 546 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 547 { 548 Mat_SeqAIJKokkos *aijkok; 549 550 PetscFunctionBegin; 551 if (A->factortype == MAT_FACTOR_NONE) { 552 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 553 delete aijkok; 554 } else { 555 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 556 } 557 A->spptr = NULL; 558 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 560 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 561 PetscCall(MatDestroy_SeqAIJ(A)); 562 PetscFunctionReturn(0); 563 } 564 565 /*MC 566 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 567 568 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 569 570 Options Database Keys: 571 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 572 573 Level: beginner 574 575 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 576 M*/ 577 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 578 { 579 PetscFunctionBegin; 580 PetscCall(PetscKokkosInitializeCheck()); 581 PetscCall(MatCreate_SeqAIJ(A)); 582 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 583 PetscFunctionReturn(0); 584 } 585 586 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 587 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 588 { 589 Mat_SeqAIJ *a,*b; 590 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 591 MatScalarKokkosView aa,ba,ca; 592 MatRowMapKokkosView ai,bi,ci; 593 MatColIdxKokkosView aj,bj,cj; 594 PetscInt m,n,nnz,aN; 595 596 PetscFunctionBegin; 597 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 598 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 599 PetscValidPointer(C,4); 600 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 601 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 602 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 603 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 604 605 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 606 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 607 a = static_cast<Mat_SeqAIJ*>(A->data); 608 b = static_cast<Mat_SeqAIJ*>(B->data); 609 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 610 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 611 aa = akok->a_dual.view_device(); 612 ai = akok->i_dual.view_device(); 613 ba = bkok->a_dual.view_device(); 614 bi = bkok->i_dual.view_device(); 615 m = A->rmap->n; /* M, N and nnz of C */ 616 n = A->cmap->n + B->cmap->n; 617 nnz = a->nz + b->nz; 618 aN = A->cmap->n; /* N of A */ 619 if (reuse == MAT_INITIAL_MATRIX) { 620 aj = akok->j_dual.view_device(); 621 bj = bkok->j_dual.view_device(); 622 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 623 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 624 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 625 ca = ca_dual.view_device(); 626 ci = ci_dual.view_device(); 627 cj = cj_dual.view_device(); 628 629 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 630 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 631 PetscInt i = t.league_rank(); /* row i */ 632 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 633 634 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 635 ci(i) = coffset; 636 if (i == m-1) ci(m) = ai(m) + bi(m); 637 }); 638 639 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 640 if (k < alen) { 641 ca(coffset+k) = aa(ai(i)+k); 642 cj(coffset+k) = aj(ai(i)+k); 643 } else { 644 ca(coffset+k) = ba(bi(i)+k-alen); 645 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 646 } 647 }); 648 }); 649 ca_dual.modify_device(); 650 ci_dual.modify_device(); 651 cj_dual.modify_device(); 652 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 653 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 654 } else if (reuse == MAT_REUSE_MATRIX) { 655 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 656 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 657 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 658 ca = ckok->a_dual.view_device(); 659 ci = ckok->i_dual.view_device(); 660 661 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 662 PetscInt i = t.league_rank(); /* row i */ 663 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 664 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 665 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 666 else ca(ci(i)+k) = ba(bi(i)+k-alen); 667 }); 668 }); 669 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 670 } 671 PetscFunctionReturn(0); 672 } 673 674 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 675 { 676 PetscFunctionBegin; 677 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 678 PetscFunctionReturn(0); 679 } 680 681 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 682 { 683 Mat_Product *product = C->product; 684 Mat A,B; 685 bool transA,transB; /* use bool, since KK needs this type */ 686 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 687 Mat_SeqAIJ *c; 688 MatProductData_SeqAIJKokkos *pdata; 689 KokkosCsrMatrix *csrmatA,*csrmatB; 690 691 PetscFunctionBegin; 692 MatCheckProduct(C,1); 693 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 694 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 695 696 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 697 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 698 PetscFunctionReturn(0); 699 } 700 701 switch (product->type) { 702 case MATPRODUCT_AB: transA = false; transB = false; break; 703 case MATPRODUCT_AtB: transA = true; transB = false; break; 704 case MATPRODUCT_ABt: transA = false; transB = true; break; 705 default: 706 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 707 } 708 709 A = product->A; 710 B = product->B; 711 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 712 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 713 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 714 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 715 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 716 717 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 718 719 csrmatA = &akok->csrmat; 720 csrmatB = &bkok->csrmat; 721 722 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 723 if (transA) { 724 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 725 transA = false; 726 } 727 728 if (transB) { 729 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 730 transB = false; 731 } 732 PetscCall(PetscLogGpuTimeBegin()); 733 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 734 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 735 PetscCall(PetscLogGpuTimeEnd()); 736 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 737 /* shorter version of MatAssemblyEnd_SeqAIJ */ 738 c = (Mat_SeqAIJ*)C->data; 739 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 740 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 741 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 742 c->reallocs = 0; 743 C->info.mallocs = 0; 744 C->info.nz_unneeded = 0; 745 C->assembled = C->was_assembled = PETSC_TRUE; 746 C->num_ass++; 747 PetscFunctionReturn(0); 748 } 749 750 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 751 { 752 Mat_Product *product = C->product; 753 MatProductType ptype; 754 Mat A,B; 755 bool transA,transB; 756 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 757 MatProductData_SeqAIJKokkos *pdata; 758 MPI_Comm comm; 759 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 760 761 PetscFunctionBegin; 762 MatCheckProduct(C,1); 763 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 764 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 765 A = product->A; 766 B = product->B; 767 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 768 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 769 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 770 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 771 csrmatA = &akok->csrmat; 772 csrmatB = &bkok->csrmat; 773 774 ptype = product->type; 775 switch (ptype) { 776 case MATPRODUCT_AB: transA = false; transB = false; break; 777 case MATPRODUCT_AtB: transA = true; transB = false; break; 778 case MATPRODUCT_ABt: transA = false; transB = true; break; 779 default: 780 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 781 } 782 783 product->data = pdata = new MatProductData_SeqAIJKokkos(); 784 pdata->kh.set_team_work_size(16); 785 pdata->kh.set_dynamic_scheduling(true); 786 pdata->reusesym = product->api_user; 787 788 /* TODO: add command line options to select spgemm algorithms */ 789 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 790 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 791 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 792 */ 793 pdata->kh.create_spgemm_handle(spgemm_alg); 794 795 PetscCall(PetscLogGpuTimeBegin()); 796 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 797 if (transA) { 798 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 799 transA = false; 800 } 801 802 if (transB) { 803 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 804 transB = false; 805 } 806 807 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 808 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 809 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 810 calling new Mat_SeqAIJKokkos(). 811 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 812 */ 813 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 814 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 815 PetscCall(PetscLogGpuTimeEnd()); 816 817 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 818 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 819 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 820 PetscFunctionReturn(0); 821 } 822 823 /* handles sparse matrix matrix ops */ 824 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 825 { 826 Mat_Product *product = mat->product; 827 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 828 829 PetscFunctionBegin; 830 MatCheckProduct(mat,1); 831 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 832 if (product->type == MATPRODUCT_ABC) { 833 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 834 } 835 if (Biskok && Ciskok) { 836 switch (product->type) { 837 case MATPRODUCT_AB: 838 case MATPRODUCT_AtB: 839 case MATPRODUCT_ABt: 840 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 841 break; 842 case MATPRODUCT_PtAP: 843 case MATPRODUCT_RARt: 844 case MATPRODUCT_ABC: 845 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 846 break; 847 default: 848 break; 849 } 850 } else { /* fallback for AIJ */ 851 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 852 } 853 PetscFunctionReturn(0); 854 } 855 856 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 857 { 858 Mat_SeqAIJKokkos *aijkok; 859 860 PetscFunctionBegin; 861 PetscCall(PetscLogGpuTimeBegin()); 862 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 863 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 864 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 865 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 866 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 867 PetscCall(PetscLogGpuTimeEnd()); 868 PetscFunctionReturn(0); 869 } 870 871 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 872 { 873 Mat_SeqAIJKokkos *aijkok; 874 875 PetscFunctionBegin; 876 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 877 if (aijkok) { /* Only zero the device if data is already there */ 878 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 879 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 880 } else { /* Might be preallocated but not assembled */ 881 PetscCall(MatZeroEntries_SeqAIJ(A)); 882 } 883 PetscFunctionReturn(0); 884 } 885 886 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x) 887 { 888 Mat_SeqAIJ *aijseq; 889 Mat_SeqAIJKokkos *aijkok; 890 PetscInt n; 891 PetscScalarKokkosView xv; 892 893 PetscFunctionBegin; 894 PetscCall(VecGetLocalSize(x,&n)); 895 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 896 PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 897 898 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 899 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 900 901 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 902 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 903 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 904 aijkok->SetDiagonal(aijseq->diag); 905 } 906 907 const auto& Aa = aijkok->a_dual.view_device(); 908 const auto& Ai = aijkok->i_dual.view_device(); 909 const auto& Adiag = aijkok->diag_dual.view_device(); 910 911 PetscCall(VecGetKokkosViewWrite(x,&xv)); 912 Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) { 913 if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i)); 914 else xv(i) = 0; 915 }); 916 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 917 PetscFunctionReturn(0); 918 } 919 920 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 921 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 922 { 923 Mat_SeqAIJKokkos *aijkok; 924 925 PetscFunctionBegin; 926 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 927 PetscValidPointer(kv,2); 928 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 929 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 930 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 931 *kv = aijkok->a_dual.view_device(); 932 PetscFunctionReturn(0); 933 } 934 935 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 936 { 937 PetscFunctionBegin; 938 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 939 PetscValidPointer(kv,2); 940 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 941 PetscFunctionReturn(0); 942 } 943 944 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 945 { 946 Mat_SeqAIJKokkos *aijkok; 947 948 PetscFunctionBegin; 949 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 950 PetscValidPointer(kv,2); 951 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 952 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 953 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 954 *kv = aijkok->a_dual.view_device(); 955 PetscFunctionReturn(0); 956 } 957 958 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 959 { 960 PetscFunctionBegin; 961 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 962 PetscValidPointer(kv,2); 963 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 964 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 965 PetscFunctionReturn(0); 966 } 967 968 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 969 { 970 Mat_SeqAIJKokkos *aijkok; 971 972 PetscFunctionBegin; 973 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 974 PetscValidPointer(kv,2); 975 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 976 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 977 *kv = aijkok->a_dual.view_device(); 978 PetscFunctionReturn(0); 979 } 980 981 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 982 { 983 PetscFunctionBegin; 984 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 985 PetscValidPointer(kv,2); 986 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 987 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 988 PetscFunctionReturn(0); 989 } 990 991 /* Computes Y += alpha X */ 992 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 993 { 994 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 995 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 996 ConstMatScalarKokkosView Xa; 997 MatScalarKokkosView Ya; 998 999 PetscFunctionBegin; 1000 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 1001 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 1002 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1003 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1004 PetscCall(PetscLogGpuTimeBegin()); 1005 1006 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1007 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1008 PetscBool e; 1009 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 1010 if (e) { 1011 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 1012 if (e) pattern = SAME_NONZERO_PATTERN; 1013 } 1014 } 1015 1016 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1017 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1018 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1019 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1020 */ 1021 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 1022 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 1023 Xa = xkok->a_dual.view_device(); 1024 Ya = ykok->a_dual.view_device(); 1025 1026 if (pattern == SAME_NONZERO_PATTERN) { 1027 KokkosBlas::axpy(alpha,Xa,Ya); 1028 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1029 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1030 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1031 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1032 1033 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1034 PetscInt i = t.league_rank(); /* row i */ 1035 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1036 PetscInt p,q = Yi(i); 1037 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1038 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1039 if (Xj(p) == Yj(q)) { /* Found it */ 1040 Ya(q) += alpha * Xa(p); 1041 q++; 1042 } else { 1043 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1044 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1045 */ 1046 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = 1047 #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99) 1048 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1049 #else 1050 Kokkos::Experimental::nan("1"); 1051 #endif 1052 } 1053 } 1054 }); 1055 }); 1056 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1057 } else { /* different nonzero patterns */ 1058 Mat Z; 1059 KokkosCsrMatrix zcsr; 1060 KernelHandle kh; 1061 kh.create_spadd_handle(false); 1062 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1063 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1064 zkok = new Mat_SeqAIJKokkos(zcsr); 1065 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1066 PetscCall(MatHeaderReplace(Y,&Z)); 1067 kh.destroy_spadd_handle(); 1068 } 1069 PetscCall(PetscLogGpuTimeEnd()); 1070 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1071 PetscFunctionReturn(0); 1072 } 1073 1074 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1075 { 1076 Mat_SeqAIJKokkos *akok; 1077 Mat_SeqAIJ *aseq; 1078 1079 PetscFunctionBegin; 1080 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1081 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1082 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1083 delete akok; 1084 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1085 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1086 akok->SetUpCOO(aseq); 1087 PetscFunctionReturn(0); 1088 } 1089 1090 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1091 { 1092 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1093 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1094 PetscCount Annz = aseq->nz; 1095 const PetscCountKokkosView& jmap = akok->jmap_d; 1096 const PetscCountKokkosView& perm = akok->perm_d; 1097 MatScalarKokkosView Aa; 1098 ConstMatScalarKokkosView kv; 1099 PetscMemType memtype; 1100 1101 PetscFunctionBegin; 1102 PetscCall(PetscGetMemType(v,&memtype)); 1103 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1104 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1105 } else { 1106 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1107 } 1108 1109 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1110 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1111 1112 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1113 PetscScalar sum = 0.0; 1114 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1115 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1116 }); 1117 1118 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1119 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1120 PetscFunctionReturn(0); 1121 } 1122 1123 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1124 { 1125 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1126 MatScalarKokkosView Aa; 1127 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1128 PetscInt m = A->rmap->n; 1129 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1130 1131 PetscFunctionBegin; 1132 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1133 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1134 PetscScalar tmp; 1135 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1136 tmp = Aa(Ai(i)); 1137 Aa(Ai(i)) = Aa(Adiag(i)); 1138 Aa(Adiag(i)) = tmp; 1139 } 1140 }); 1141 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1146 { 1147 PetscFunctionBegin; 1148 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1149 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1150 B->offloadmask = PETSC_OFFLOAD_CPU; 1151 PetscFunctionReturn(0); 1152 } 1153 1154 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1155 { 1156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1157 1158 PetscFunctionBegin; 1159 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1160 A->boundtocpu = PETSC_FALSE; 1161 1162 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1163 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1164 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1165 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1166 A->ops->scale = MatScale_SeqAIJKokkos; 1167 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1168 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1169 A->ops->mult = MatMult_SeqAIJKokkos; 1170 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1171 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1172 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1173 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1174 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1175 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1176 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1177 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1178 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1179 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1180 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1181 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1182 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1183 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1184 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1185 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1186 1187 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1188 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1189 PetscFunctionReturn(0); 1190 } 1191 1192 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1193 { 1194 Mat_SeqAIJ *aseq; 1195 PetscInt i,m,n; 1196 1197 PetscFunctionBegin; 1198 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1199 1200 m = akok->nrows(); 1201 n = akok->ncols(); 1202 PetscCall(MatSetSizes(A,m,n,m,n)); 1203 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1204 1205 /* Set up data structures of A as a MATSEQAIJ */ 1206 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1207 aseq = (Mat_SeqAIJ*)(A)->data; 1208 1209 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1210 akok->j_dual.sync_host(); 1211 1212 aseq->i = akok->i_host_data(); 1213 aseq->j = akok->j_host_data(); 1214 aseq->a = akok->a_host_data(); 1215 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1216 aseq->singlemalloc = PETSC_FALSE; 1217 aseq->free_a = PETSC_FALSE; 1218 aseq->free_ij = PETSC_FALSE; 1219 aseq->nz = akok->nnz(); 1220 aseq->maxnz = aseq->nz; 1221 1222 PetscCall(PetscMalloc1(m,&aseq->imax)); 1223 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1224 for (i=0; i<m; i++) { 1225 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1226 } 1227 1228 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1229 akok->nonzerostate = A->nonzerostate; 1230 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1231 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1232 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1233 PetscFunctionReturn(0); 1234 } 1235 1236 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1237 1238 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1239 */ 1240 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1241 { 1242 PetscFunctionBegin; 1243 PetscCall(MatCreate(comm,A)); 1244 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1245 PetscFunctionReturn(0); 1246 } 1247 1248 /* --------------------------------------------------------------------------------*/ 1249 /*@C 1250 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1251 (the default parallel PETSc format). This matrix will ultimately be handled by 1252 Kokkos for calculations. For good matrix 1253 assembly performance the user should preallocate the matrix storage by setting 1254 the parameter nz (or the array nnz). By setting these parameters accurately, 1255 performance during matrix assembly can be increased by more than a factor of 50. 1256 1257 Collective 1258 1259 Input Parameters: 1260 + comm - MPI communicator, set to PETSC_COMM_SELF 1261 . m - number of rows 1262 . n - number of columns 1263 . nz - number of nonzeros per row (same for all rows) 1264 - nnz - array containing the number of nonzeros in the various rows 1265 (possibly different for each row) or NULL 1266 1267 Output Parameter: 1268 . A - the matrix 1269 1270 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1271 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1272 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1273 1274 Notes: 1275 If nnz is given then nz is ignored 1276 1277 The AIJ format (also called the Yale sparse matrix format or 1278 compressed row storage), is fully compatible with standard Fortran 77 1279 storage. That is, the stored row and column indices can begin at 1280 either one (as in Fortran) or zero. See the users' manual for details. 1281 1282 Specify the preallocated storage with either nz or nnz (not both). 1283 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1284 allocation. For large problems you MUST preallocate memory or you 1285 will get TERRIBLE performance, see the users' manual chapter on matrices. 1286 1287 By default, this format uses inodes (identical nodes) when possible, to 1288 improve numerical efficiency of matrix-vector products and solves. We 1289 search for consecutive rows with the same nonzero structure, thereby 1290 reusing matrix information to achieve increased efficiency. 1291 1292 Level: intermediate 1293 1294 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1295 @*/ 1296 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1297 { 1298 PetscFunctionBegin; 1299 PetscCall(PetscKokkosInitializeCheck()); 1300 PetscCall(MatCreate(comm,A)); 1301 PetscCall(MatSetSizes(*A,m,n,m,n)); 1302 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1303 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1304 PetscFunctionReturn(0); 1305 } 1306 1307 typedef Kokkos::TeamPolicy<>::member_type team_member; 1308 // 1309 // This factorization exploits block diagonal matrices with "Nf" (not used). 1310 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1311 // 1312 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1313 { 1314 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1315 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1316 IS isrow = b->row,isicol = b->icol; 1317 const PetscInt *r_h,*ic_h; 1318 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1319 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1320 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1321 PetscBool row_identity,col_identity; 1322 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1323 1324 PetscFunctionBegin; 1325 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1326 PetscCall(MatIsStructurallySymmetric(A,&row_identity)); 1327 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1328 PetscCall(ISGetIndices(isrow,&r_h)); 1329 PetscCall(ISGetIndices(isicol,&ic_h)); 1330 PetscCall(ISGetSize(isicol,&nc)); 1331 PetscCall(PetscLogGpuTimeBegin()); 1332 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1333 { 1334 #define KOKKOS_SHARED_LEVEL 1 1335 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1336 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1337 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1338 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1339 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1340 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1341 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1342 size_t flops_h = 0.0; 1343 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1344 Kokkos::View<size_t> d_flops_k ("flops"); 1345 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1346 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1347 Kokkos::deep_copy (d_flops_k, h_flops_k); 1348 Kokkos::deep_copy (d_r_k, h_r_k); 1349 Kokkos::deep_copy (d_ic_k, h_ic_k); 1350 // Fill A --> fact 1351 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1352 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1353 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1354 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1355 // zero rows of B 1356 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1357 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1358 PetscScalar *baL = ba_d + bi_d[rowb]; 1359 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1360 /* zero (unfactored row) */ 1361 for (int j=0;j<nzbL;j++) baL[j] = 0; 1362 for (int j=0;j<nzbU;j++) baU[j] = 0; 1363 }); 1364 // copy A into B 1365 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1366 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1367 const PetscScalar *av = aa_d + ai_d[rowa]; 1368 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1369 /* load in initial (unfactored row) */ 1370 for (int j=0;j<nza;j++) { 1371 PetscInt colb = ic[ajtmp[j]]; 1372 PetscScalar vala = av[j]; 1373 if (colb == rowb) { 1374 *(ba_d + bdiag_d[rowb]) = vala; 1375 } else { 1376 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1377 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1378 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1379 for (int j=0; j<nz ; j++) { 1380 if (pbj[j] == colb) { 1381 pba[j] = vala; 1382 set++; 1383 break; 1384 } 1385 } 1386 #if !defined(PETSC_HAVE_SYCL) 1387 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1388 #endif 1389 } 1390 } 1391 }); 1392 }); 1393 Kokkos::fence(); 1394 1395 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1396 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1397 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1398 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1399 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1400 const PetscInt start = field*nloc, end = start + nloc; 1401 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1402 // A22 panel update for each row A(1,:) and col A(:,1) 1403 for (int ii=start; ii<end-1; ii++) { 1404 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1405 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1406 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1407 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1408 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1409 PetscInt kIdx = j*Ni + field_block_idx; 1410 if (kIdx >= nzUi) /* void */ ; 1411 else { 1412 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1413 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1414 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1415 size_t st_idx; 1416 // find and do L(k,i) = A(:k,i) / A(i,i) 1417 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1418 // get column, there has got to be a better way 1419 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1420 if (pjL[j] == ii) { 1421 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1422 idx = j; // output 1423 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1424 } 1425 }, st_idx); 1426 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1427 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1428 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1429 #endif 1430 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1431 // U(i+1,:end) 1432 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1433 PetscScalar Uij = baUi[uiIdx]; 1434 PetscInt col = bjUi[uiIdx]; 1435 if (col==myk) { 1436 // A_kk = A_kk - L_ki * U_ij(k) 1437 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1438 *Akkv = *Akkv - L_ki() * Uij; // UiK 1439 } else { 1440 PetscScalar *start, *end, *pAkjv=NULL; 1441 PetscInt high, low; 1442 const PetscInt *startj; 1443 if (col<myk) { // L 1444 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1445 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1446 start = pLki+1; // start at pLki+1, A22(myk,1) 1447 startj= bj_d + bi_d[myk] + idx; 1448 end = ba_d + bi_d[myk+1]; 1449 } else { 1450 PetscInt idx = bdiag_d[myk+1]+1; 1451 start = ba_d + idx; 1452 startj= bj_d + idx; 1453 end = ba_d + bdiag_d[myk]; 1454 } 1455 // search for 'col', use bisection search - TODO 1456 low = 0; 1457 high = (PetscInt)(end-start); 1458 while (high-low > 5) { 1459 int t = (low+high)/2; 1460 if (startj[t] > col) high = t; 1461 else low = t; 1462 } 1463 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1464 if (startj[pAkjv-start] == col) break; 1465 } 1466 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1467 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1468 #endif 1469 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1470 } 1471 }); 1472 } 1473 }); 1474 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1475 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1476 } /* endof for (i=0; i<n; i++) { */ 1477 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1478 }); 1479 Kokkos::fence(); 1480 Kokkos::deep_copy (h_flops_k, d_flops_k); 1481 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1482 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1483 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1484 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1485 /* Invert diagonal for simpler triangular solves */ 1486 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1487 int i = start + outer_index*Ni + lg_rank%Ni; 1488 if (i < end) { 1489 PetscScalar *pv = ba_d + bdiag_d[i]; 1490 *pv = 1.0/(*pv); 1491 } 1492 }); 1493 }); 1494 } 1495 PetscCall(PetscLogGpuTimeEnd()); 1496 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1497 PetscCall(ISRestoreIndices(isrow,&r_h)); 1498 1499 PetscCall(ISIdentity(isrow,&row_identity)); 1500 PetscCall(ISIdentity(isicol,&col_identity)); 1501 if (b->inode.size) { 1502 B->ops->solve = MatSolve_SeqAIJ_Inode; 1503 } else if (row_identity && col_identity) { 1504 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1505 } else { 1506 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1507 } 1508 B->offloadmask = PETSC_OFFLOAD_GPU; 1509 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1510 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1511 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1512 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1513 B->ops->matsolve = MatMatSolve_SeqAIJ; 1514 B->assembled = PETSC_TRUE; 1515 B->preallocated = PETSC_TRUE; 1516 1517 PetscFunctionReturn(0); 1518 } 1519 1520 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1521 { 1522 PetscFunctionBegin; 1523 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1524 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1525 PetscFunctionReturn(0); 1526 } 1527 1528 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1529 { 1530 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1531 1532 PetscFunctionBegin; 1533 if (!factors->sptrsv_symbolic_completed) { 1534 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1535 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1536 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1537 } 1538 PetscFunctionReturn(0); 1539 } 1540 1541 /* Check if we need to update factors etc for transpose solve */ 1542 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1543 { 1544 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1545 MatColIdxType n = A->rmap->n; 1546 1547 PetscFunctionBegin; 1548 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1549 /* Update L^T and do sptrsv symbolic */ 1550 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1551 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1552 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1553 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1554 1555 KokkosKernels::Impl::transpose_matrix< 1556 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1557 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1558 MatRowMapKokkosView,DefaultExecutionSpace>( 1559 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1560 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1561 1562 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1563 We have to sort the indices, until KK provides finer control options. 1564 */ 1565 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1566 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1567 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1568 1569 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1570 1571 /* Update U^T and do sptrsv symbolic */ 1572 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1573 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1574 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1575 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1576 1577 KokkosKernels::Impl::transpose_matrix< 1578 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1579 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1580 MatRowMapKokkosView,DefaultExecutionSpace>( 1581 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1582 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1583 1584 /* Sort indices. See comments above */ 1585 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1586 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1587 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1588 1589 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1590 factors->transpose_updated = PETSC_TRUE; 1591 } 1592 PetscFunctionReturn(0); 1593 } 1594 1595 /* Solve Ax = b, with A = LU */ 1596 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1597 { 1598 ConstPetscScalarKokkosView bv; 1599 PetscScalarKokkosView xv; 1600 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1601 1602 PetscFunctionBegin; 1603 PetscCall(PetscLogGpuTimeBegin()); 1604 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1605 PetscCall(VecGetKokkosView(b,&bv)); 1606 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1607 /* Solve L tmpv = b */ 1608 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1609 /* Solve Ux = tmpv */ 1610 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1611 PetscCall(VecRestoreKokkosView(b,&bv)); 1612 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1613 PetscCall(PetscLogGpuTimeEnd()); 1614 PetscFunctionReturn(0); 1615 } 1616 1617 /* Solve A^T x = b, where A^T = U^T L^T */ 1618 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1619 { 1620 ConstPetscScalarKokkosView bv; 1621 PetscScalarKokkosView xv; 1622 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1623 1624 PetscFunctionBegin; 1625 PetscCall(PetscLogGpuTimeBegin()); 1626 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1627 PetscCall(VecGetKokkosView(b,&bv)); 1628 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1629 /* Solve U^T tmpv = b */ 1630 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1631 1632 /* Solve L^T x = tmpv */ 1633 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1634 PetscCall(VecRestoreKokkosView(b,&bv)); 1635 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1636 PetscCall(PetscLogGpuTimeEnd()); 1637 PetscFunctionReturn(0); 1638 } 1639 1640 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1641 { 1642 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1643 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1644 PetscInt fill_lev = info->levels; 1645 1646 PetscFunctionBegin; 1647 PetscCall(PetscLogGpuTimeBegin()); 1648 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1649 1650 auto a_d = aijkok->a_dual.view_device(); 1651 auto i_d = aijkok->i_dual.view_device(); 1652 auto j_d = aijkok->j_dual.view_device(); 1653 1654 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1655 1656 B->assembled = PETSC_TRUE; 1657 B->preallocated = PETSC_TRUE; 1658 B->ops->solve = MatSolve_SeqAIJKokkos; 1659 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1660 B->ops->matsolve = NULL; 1661 B->ops->matsolvetranspose = NULL; 1662 B->offloadmask = PETSC_OFFLOAD_GPU; 1663 1664 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1665 factors->transpose_updated = PETSC_FALSE; 1666 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1667 /* TODO: log flops, but how to know that? */ 1668 PetscCall(PetscLogGpuTimeEnd()); 1669 PetscFunctionReturn(0); 1670 } 1671 1672 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1673 { 1674 Mat_SeqAIJKokkos *aijkok; 1675 Mat_SeqAIJ *b; 1676 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1677 PetscInt fill_lev = info->levels; 1678 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1679 PetscInt n = A->rmap->n; 1680 1681 PetscFunctionBegin; 1682 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1683 /* Rebuild factors */ 1684 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1685 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1686 1687 /* Create a spiluk handle and then do symbolic factorization */ 1688 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1689 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1690 1691 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1692 1693 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1694 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1695 Kokkos::realloc(factors->iU_d,n+1); 1696 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1697 1698 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1699 auto i_d = aijkok->i_dual.view_device(); 1700 auto j_d = aijkok->j_dual.view_device(); 1701 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1702 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1703 1704 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1705 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1706 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1707 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1708 1709 /* TODO: add options to select sptrsv algorithms */ 1710 /* Create sptrsv handles for L, U and their transpose */ 1711 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1712 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1713 #else 1714 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1715 #endif 1716 1717 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1718 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1719 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1720 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1721 1722 /* Fill fields of the factor matrix B */ 1723 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1724 b = (Mat_SeqAIJ*)B->data; 1725 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1726 B->info.fill_ratio_given = info->fill; 1727 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1728 1729 B->offloadmask = PETSC_OFFLOAD_GPU; 1730 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1731 PetscFunctionReturn(0); 1732 } 1733 1734 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1735 { 1736 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1737 const PetscInt nrows = A->rmap->n; 1738 1739 PetscFunctionBegin; 1740 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1741 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1742 // move B data into Kokkos 1743 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1744 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1745 { 1746 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1747 if (!baijkok->diag_d.extent(0)) { 1748 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1749 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1750 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1751 } 1752 } 1753 PetscFunctionReturn(0); 1754 } 1755 1756 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1757 { 1758 PetscFunctionBegin; 1759 *type = MATSOLVERKOKKOS; 1760 PetscFunctionReturn(0); 1761 } 1762 1763 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1764 { 1765 PetscFunctionBegin; 1766 *type = MATSOLVERKOKKOSDEVICE; 1767 PetscFunctionReturn(0); 1768 } 1769 1770 /*MC 1771 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1772 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1773 1774 Level: beginner 1775 1776 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1777 M*/ 1778 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1779 { 1780 PetscInt n = A->rmap->n; 1781 1782 PetscFunctionBegin; 1783 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1784 PetscCall(MatSetSizes(*B,n,n,n,n)); 1785 (*B)->factortype = ftype; 1786 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1787 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1788 1789 if (ftype == MAT_FACTOR_LU) { 1790 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1791 (*B)->canuseordering = PETSC_TRUE; 1792 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1793 } else if (ftype == MAT_FACTOR_ILU) { 1794 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1795 (*B)->canuseordering = PETSC_FALSE; 1796 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1797 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1798 1799 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1800 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1801 PetscFunctionReturn(0); 1802 } 1803 1804 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1805 { 1806 PetscInt n = A->rmap->n; 1807 1808 PetscFunctionBegin; 1809 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1810 PetscCall(MatSetSizes(*B,n,n,n,n)); 1811 (*B)->factortype = ftype; 1812 (*B)->canuseordering = PETSC_TRUE; 1813 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1814 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1815 1816 if (ftype == MAT_FACTOR_LU) { 1817 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1818 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1819 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1820 1821 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1822 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1823 PetscFunctionReturn(0); 1824 } 1825 1826 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1827 { 1828 PetscFunctionBegin; 1829 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1830 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1831 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1832 PetscFunctionReturn(0); 1833 } 1834 1835 /* Utility to print out a KokkosCsrMatrix for debugging */ 1836 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1837 { 1838 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1839 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1840 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1841 const PetscInt *i = iv.data(); 1842 const PetscInt *j = jv.data(); 1843 const PetscScalar *a = av.data(); 1844 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1845 1846 PetscFunctionBegin; 1847 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1848 for (PetscInt k=0; k<m; k++) { 1849 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1850 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1851 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1852 } 1853 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1854 } 1855 PetscFunctionReturn(0); 1856 } 1857