1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosKernels_Sorting.hpp> 11 #include <KokkosSparse_CrsMatrix.hpp> 12 #include <KokkosSparse_spmv.hpp> 13 #include <KokkosSparse_spiluk.hpp> 14 #include <KokkosSparse_sptrsv.hpp> 15 #include <KokkosSparse_spgemm.hpp> 16 #include <KokkosSparse_spadd.hpp> 17 18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 Mat_SeqAIJ *aijseq; 29 Mat_SeqAIJKokkos *aijkok; 30 31 PetscFunctionBegin; 32 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 33 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34 35 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37 38 /* If aijkok does not exist, we just copy i, j to device. 39 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40 In both cases, we build a new aijkok structure. 41 */ 42 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43 delete aijkok; 44 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45 A->spptr = aijkok; 46 } 47 48 if (aijkok->device_mat_d.data()) { 49 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50 } 51 PetscFunctionReturn(0); 52 } 53 54 /* Sync CSR data to device if not yet */ 55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 56 { 57 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 58 59 PetscFunctionBegin; 60 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 61 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62 if (aijkok->a_dual.need_sync_device()) { 63 aijkok->a_dual.sync_device(); 64 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 65 aijkok->hermitian_updated = PETSC_FALSE; 66 } 67 PetscFunctionReturn(0); 68 } 69 70 /* Mark the CSR data on device as modified */ 71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 72 { 73 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 74 75 PetscFunctionBegin; 76 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 77 aijkok->a_dual.clear_sync_state(); 78 aijkok->a_dual.modify_device(); 79 aijkok->transpose_updated = PETSC_FALSE; 80 aijkok->hermitian_updated = PETSC_FALSE; 81 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 82 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 83 PetscFunctionReturn(0); 84 } 85 86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87 { 88 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89 90 PetscFunctionBegin; 91 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 92 /* We do not expect one needs factors on host */ 93 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 94 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95 aijkok->a_dual.sync_host(); 96 PetscFunctionReturn(0); 97 } 98 99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100 { 101 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102 103 PetscFunctionBegin; 104 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 105 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 106 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 107 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 108 */ 109 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110 aijkok->a_dual.sync_host(); 111 *array = aijkok->a_dual.view_host().data(); 112 } else { /* Happens when calling MatSetValues on a newly created matrix */ 113 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114 } 115 PetscFunctionReturn(0); 116 } 117 118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119 { 120 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121 122 PetscFunctionBegin; 123 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133 aijkok->a_dual.sync_host(); 134 *array = aijkok->a_dual.view_host().data(); 135 } else { 136 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 137 } 138 PetscFunctionReturn(0); 139 } 140 141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142 { 143 PetscFunctionBegin; 144 *array = NULL; 145 PetscFunctionReturn(0); 146 } 147 148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149 { 150 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151 152 PetscFunctionBegin; 153 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154 *array = aijkok->a_dual.view_host().data(); 155 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 156 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 157 } 158 PetscFunctionReturn(0); 159 } 160 161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162 { 163 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164 165 PetscFunctionBegin; 166 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167 aijkok->a_dual.clear_sync_state(); 168 aijkok->a_dual.modify_host(); 169 } 170 PetscFunctionReturn(0); 171 } 172 173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 174 { 175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176 177 PetscFunctionBegin; 178 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 179 180 if (i) *i = aijkok->i_device_data(); 181 if (j) *j = aijkok->j_device_data(); 182 if (a) { 183 aijkok->a_dual.sync_device(); 184 *a = aijkok->a_device_data(); 185 } 186 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 187 PetscFunctionReturn(0); 188 } 189 190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192 { 193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195 196 PetscFunctionBegin; 197 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200 PetscFunctionReturn(0); 201 } 202 203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205 { 206 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207 208 PetscFunctionBegin; 209 if (aijkok && aijkok->device_mat_d.data()) { 210 *d_mat = aijkok->device_mat_d.data(); 211 } else { 212 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213 *d_mat = NULL; 214 } 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the transpose on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 227 PetscCallCXX(aijkok->a_dual.sync_device()); 228 PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 229 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 230 aijkok->transpose_updated = PETSC_TRUE; 231 } 232 *csrmatT = &aijkok->csrmatT; 233 PetscFunctionReturn(0); 234 } 235 236 /* Generate the Hermitian on device and cache it internally */ 237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238 { 239 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240 241 PetscFunctionBegin; 242 PetscCall(PetscLogGpuTimeBegin()); 243 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 245 PetscCallCXX(aijkok->a_dual.sync_device()); 246 PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 247 PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248 #if defined(PETSC_USE_COMPLEX) 249 const auto& a = aijkok->csrmatH.values; 250 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251 #endif 252 aijkok->hermitian_updated = PETSC_TRUE; 253 } 254 *csrmatH = &aijkok->csrmatH; 255 PetscCall(PetscLogGpuTimeEnd()); 256 PetscFunctionReturn(0); 257 } 258 259 /* y = A x */ 260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 261 { 262 Mat_SeqAIJKokkos *aijkok; 263 ConstPetscScalarKokkosView xv; 264 PetscScalarKokkosView yv; 265 266 PetscFunctionBegin; 267 PetscCall(PetscLogGpuTimeBegin()); 268 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 269 PetscCall(VecGetKokkosView(xx,&xv)); 270 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 271 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 273 PetscCall(VecRestoreKokkosView(xx,&xv)); 274 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 276 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 277 PetscCall(PetscLogGpuTimeEnd()); 278 PetscFunctionReturn(0); 279 } 280 281 /* y = A^T x */ 282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 283 { 284 Mat_SeqAIJKokkos *aijkok; 285 const char *mode; 286 ConstPetscScalarKokkosView xv; 287 PetscScalarKokkosView yv; 288 KokkosCsrMatrix *csrmat; 289 290 PetscFunctionBegin; 291 PetscCall(PetscLogGpuTimeBegin()); 292 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 293 PetscCall(VecGetKokkosView(xx,&xv)); 294 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295 if (A->form_explicit_transpose) { 296 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297 mode = "N"; 298 } else { 299 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300 csrmat = &aijkok->csrmat; 301 mode = "T"; 302 } 303 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 304 PetscCall(VecRestoreKokkosView(xx,&xv)); 305 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 306 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 307 PetscCall(PetscLogGpuTimeEnd()); 308 PetscFunctionReturn(0); 309 } 310 311 /* y = A^H x */ 312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 313 { 314 Mat_SeqAIJKokkos *aijkok; 315 const char *mode; 316 ConstPetscScalarKokkosView xv; 317 PetscScalarKokkosView yv; 318 KokkosCsrMatrix *csrmat; 319 320 PetscFunctionBegin; 321 PetscCall(PetscLogGpuTimeBegin()); 322 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 323 PetscCall(VecGetKokkosView(xx,&xv)); 324 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325 if (A->form_explicit_transpose) { 326 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327 mode = "N"; 328 } else { 329 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330 csrmat = &aijkok->csrmat; 331 mode = "C"; 332 } 333 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 334 PetscCall(VecRestoreKokkosView(xx,&xv)); 335 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 336 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 337 PetscCall(PetscLogGpuTimeEnd()); 338 PetscFunctionReturn(0); 339 } 340 341 /* z = A x + y */ 342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 343 { 344 Mat_SeqAIJKokkos *aijkok; 345 ConstPetscScalarKokkosView xv,yv; 346 PetscScalarKokkosView zv; 347 348 PetscFunctionBegin; 349 PetscCall(PetscLogGpuTimeBegin()); 350 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 351 PetscCall(VecGetKokkosView(xx,&xv)); 352 PetscCall(VecGetKokkosView(yy,&yv)); 353 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 354 if (zz != yy) Kokkos::deep_copy(zv,yv); 355 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 357 PetscCall(VecRestoreKokkosView(xx,&xv)); 358 PetscCall(VecRestoreKokkosView(yy,&yv)); 359 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 360 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 361 PetscCall(PetscLogGpuTimeEnd()); 362 PetscFunctionReturn(0); 363 } 364 365 /* z = A^T x + y */ 366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 367 { 368 Mat_SeqAIJKokkos *aijkok; 369 const char *mode; 370 ConstPetscScalarKokkosView xv,yv; 371 PetscScalarKokkosView zv; 372 KokkosCsrMatrix *csrmat; 373 374 PetscFunctionBegin; 375 PetscCall(PetscLogGpuTimeBegin()); 376 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 377 PetscCall(VecGetKokkosView(xx,&xv)); 378 PetscCall(VecGetKokkosView(yy,&yv)); 379 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 380 if (zz != yy) Kokkos::deep_copy(zv,yv); 381 if (A->form_explicit_transpose) { 382 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383 mode = "N"; 384 } else { 385 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386 csrmat = &aijkok->csrmat; 387 mode = "T"; 388 } 389 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 390 PetscCall(VecRestoreKokkosView(xx,&xv)); 391 PetscCall(VecRestoreKokkosView(yy,&yv)); 392 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 393 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 394 PetscCall(PetscLogGpuTimeEnd()); 395 PetscFunctionReturn(0); 396 } 397 398 /* z = A^H x + y */ 399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 400 { 401 Mat_SeqAIJKokkos *aijkok; 402 const char *mode; 403 ConstPetscScalarKokkosView xv,yv; 404 PetscScalarKokkosView zv; 405 KokkosCsrMatrix *csrmat; 406 407 PetscFunctionBegin; 408 PetscCall(PetscLogGpuTimeBegin()); 409 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 410 PetscCall(VecGetKokkosView(xx,&xv)); 411 PetscCall(VecGetKokkosView(yy,&yv)); 412 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 413 if (zz != yy) Kokkos::deep_copy(zv,yv); 414 if (A->form_explicit_transpose) { 415 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416 mode = "N"; 417 } else { 418 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419 csrmat = &aijkok->csrmat; 420 mode = "C"; 421 } 422 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 423 PetscCall(VecRestoreKokkosView(xx,&xv)); 424 PetscCall(VecRestoreKokkosView(yy,&yv)); 425 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 426 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 427 PetscCall(PetscLogGpuTimeEnd()); 428 PetscFunctionReturn(0); 429 } 430 431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432 { 433 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434 435 PetscFunctionBegin; 436 switch (op) { 437 case MAT_FORM_EXPLICIT_TRANSPOSE: 438 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 439 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440 A->form_explicit_transpose = flg; 441 break; 442 default: 443 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444 break; 445 } 446 PetscFunctionReturn(0); 447 } 448 449 /* Depending on reuse, either build a new mat, or use the existing mat */ 450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 451 { 452 Mat_SeqAIJ *aseq; 453 454 PetscFunctionBegin; 455 PetscCall(PetscKokkosInitializeCheck()); 456 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 457 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 458 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 459 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 461 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 462 PetscCall(PetscFree(A->defaultvectype)); 463 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 464 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 465 PetscCall(MatSetOps_SeqAIJKokkos(A)); 466 aseq = static_cast<Mat_SeqAIJ*>(A->data); 467 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 468 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 470 } 471 } 472 PetscFunctionReturn(0); 473 } 474 475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477 */ 478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 479 { 480 Mat_SeqAIJ *bseq; 481 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482 Mat mat; 483 484 PetscFunctionBegin; 485 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 486 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487 mat = *B; 488 if (A->assembled) { 489 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492 /* Now copy values to B if needed */ 493 if (dupOption == MAT_COPY_VALUES) { 494 if (akok->a_dual.need_sync_device()) { 495 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496 bkok->a_dual.modify_host(); 497 } else { /* If device has the latest data, we only copy data on device */ 498 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499 bkok->a_dual.modify_device(); 500 } 501 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503 bkok->a_dual.modify_host(); 504 } 505 mat->spptr = bkok; 506 } 507 508 PetscCall(PetscFree(mat->defaultvectype)); 509 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 510 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 511 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 512 PetscFunctionReturn(0); 513 } 514 515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 516 { 517 Mat At; 518 KokkosCsrMatrix *internT; 519 Mat_SeqAIJKokkos *atkok,*bkok; 520 521 PetscFunctionBegin; 522 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 523 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 524 /* Deep copy internT, as we want to isolate the internal transpose */ 525 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 526 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 527 if (reuse == MAT_INITIAL_MATRIX) *B = At; 528 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 529 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 530 if ((*B)->assembled) { 531 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 532 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 533 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 534 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 535 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 536 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 537 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 538 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 539 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 540 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 541 } 542 PetscFunctionReturn(0); 543 } 544 545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 546 { 547 Mat_SeqAIJKokkos *aijkok; 548 549 PetscFunctionBegin; 550 if (A->factortype == MAT_FACTOR_NONE) { 551 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 552 delete aijkok; 553 } else { 554 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 555 } 556 A->spptr = NULL; 557 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 558 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 560 PetscCall(MatDestroy_SeqAIJ(A)); 561 PetscFunctionReturn(0); 562 } 563 564 /*MC 565 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 566 567 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 568 569 Options Database Keys: 570 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 571 572 Level: beginner 573 574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 575 M*/ 576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 577 { 578 PetscFunctionBegin; 579 PetscCall(PetscKokkosInitializeCheck()); 580 PetscCall(MatCreate_SeqAIJ(A)); 581 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 582 PetscFunctionReturn(0); 583 } 584 585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 587 { 588 Mat_SeqAIJ *a,*b; 589 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 590 MatScalarKokkosView aa,ba,ca; 591 MatRowMapKokkosView ai,bi,ci; 592 MatColIdxKokkosView aj,bj,cj; 593 PetscInt m,n,nnz,aN; 594 595 PetscFunctionBegin; 596 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 597 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 598 PetscValidPointer(C,4); 599 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 600 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 601 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 602 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 603 604 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 605 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 606 a = static_cast<Mat_SeqAIJ*>(A->data); 607 b = static_cast<Mat_SeqAIJ*>(B->data); 608 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 609 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 610 aa = akok->a_dual.view_device(); 611 ai = akok->i_dual.view_device(); 612 ba = bkok->a_dual.view_device(); 613 bi = bkok->i_dual.view_device(); 614 m = A->rmap->n; /* M, N and nnz of C */ 615 n = A->cmap->n + B->cmap->n; 616 nnz = a->nz + b->nz; 617 aN = A->cmap->n; /* N of A */ 618 if (reuse == MAT_INITIAL_MATRIX) { 619 aj = akok->j_dual.view_device(); 620 bj = bkok->j_dual.view_device(); 621 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 622 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 623 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 624 ca = ca_dual.view_device(); 625 ci = ci_dual.view_device(); 626 cj = cj_dual.view_device(); 627 628 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 629 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 630 PetscInt i = t.league_rank(); /* row i */ 631 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 632 633 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 634 ci(i) = coffset; 635 if (i == m-1) ci(m) = ai(m) + bi(m); 636 }); 637 638 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 639 if (k < alen) { 640 ca(coffset+k) = aa(ai(i)+k); 641 cj(coffset+k) = aj(ai(i)+k); 642 } else { 643 ca(coffset+k) = ba(bi(i)+k-alen); 644 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 645 } 646 }); 647 }); 648 ca_dual.modify_device(); 649 ci_dual.modify_device(); 650 cj_dual.modify_device(); 651 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 652 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 653 } else if (reuse == MAT_REUSE_MATRIX) { 654 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 655 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 656 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 657 ca = ckok->a_dual.view_device(); 658 ci = ckok->i_dual.view_device(); 659 660 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 661 PetscInt i = t.league_rank(); /* row i */ 662 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 663 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 664 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 665 else ca(ci(i)+k) = ba(bi(i)+k-alen); 666 }); 667 }); 668 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 669 } 670 PetscFunctionReturn(0); 671 } 672 673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 674 { 675 PetscFunctionBegin; 676 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 677 PetscFunctionReturn(0); 678 } 679 680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 681 { 682 Mat_Product *product = C->product; 683 Mat A,B; 684 bool transA,transB; /* use bool, since KK needs this type */ 685 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 686 Mat_SeqAIJ *c; 687 MatProductData_SeqAIJKokkos *pdata; 688 KokkosCsrMatrix *csrmatA,*csrmatB; 689 690 PetscFunctionBegin; 691 MatCheckProduct(C,1); 692 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 693 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 694 695 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 696 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 697 PetscFunctionReturn(0); 698 } 699 700 switch (product->type) { 701 case MATPRODUCT_AB: transA = false; transB = false; break; 702 case MATPRODUCT_AtB: transA = true; transB = false; break; 703 case MATPRODUCT_ABt: transA = false; transB = true; break; 704 default: 705 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 706 } 707 708 A = product->A; 709 B = product->B; 710 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 711 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 713 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 714 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 715 716 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 717 718 csrmatA = &akok->csrmat; 719 csrmatB = &bkok->csrmat; 720 721 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 722 if (transA) { 723 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 724 transA = false; 725 } 726 727 if (transB) { 728 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 729 transB = false; 730 } 731 PetscCall(PetscLogGpuTimeBegin()); 732 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 733 PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 734 PetscCall(PetscLogGpuTimeEnd()); 735 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 736 /* shorter version of MatAssemblyEnd_SeqAIJ */ 737 c = (Mat_SeqAIJ*)C->data; 738 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 739 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 740 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 741 c->reallocs = 0; 742 C->info.mallocs = 0; 743 C->info.nz_unneeded = 0; 744 C->assembled = C->was_assembled = PETSC_TRUE; 745 C->num_ass++; 746 PetscFunctionReturn(0); 747 } 748 749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 750 { 751 Mat_Product *product = C->product; 752 MatProductType ptype; 753 Mat A,B; 754 bool transA,transB; 755 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 756 MatProductData_SeqAIJKokkos *pdata; 757 MPI_Comm comm; 758 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 759 760 PetscFunctionBegin; 761 MatCheckProduct(C,1); 762 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 763 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 764 A = product->A; 765 B = product->B; 766 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 767 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 768 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 769 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 770 csrmatA = &akok->csrmat; 771 csrmatB = &bkok->csrmat; 772 773 ptype = product->type; 774 switch (ptype) { 775 case MATPRODUCT_AB: transA = false; transB = false; break; 776 case MATPRODUCT_AtB: transA = true; transB = false; break; 777 case MATPRODUCT_ABt: transA = false; transB = true; break; 778 default: 779 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 780 } 781 782 product->data = pdata = new MatProductData_SeqAIJKokkos(); 783 pdata->kh.set_team_work_size(16); 784 pdata->kh.set_dynamic_scheduling(true); 785 pdata->reusesym = product->api_user; 786 787 /* TODO: add command line options to select spgemm algorithms */ 788 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 789 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 790 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 791 */ 792 pdata->kh.create_spgemm_handle(spgemm_alg); 793 794 PetscCall(PetscLogGpuTimeBegin()); 795 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 796 if (transA) { 797 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 798 transA = false; 799 } 800 801 if (transB) { 802 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 803 transB = false; 804 } 805 806 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 808 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 809 calling new Mat_SeqAIJKokkos(). 810 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 811 */ 812 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 813 PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 814 PetscCall(PetscLogGpuTimeEnd()); 815 816 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 817 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 818 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 819 PetscFunctionReturn(0); 820 } 821 822 /* handles sparse matrix matrix ops */ 823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 824 { 825 Mat_Product *product = mat->product; 826 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827 828 PetscFunctionBegin; 829 MatCheckProduct(mat,1); 830 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 831 if (product->type == MATPRODUCT_ABC) { 832 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 833 } 834 if (Biskok && Ciskok) { 835 switch (product->type) { 836 case MATPRODUCT_AB: 837 case MATPRODUCT_AtB: 838 case MATPRODUCT_ABt: 839 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840 break; 841 case MATPRODUCT_PtAP: 842 case MATPRODUCT_RARt: 843 case MATPRODUCT_ABC: 844 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845 break; 846 default: 847 break; 848 } 849 } else { /* fallback for AIJ */ 850 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 851 } 852 PetscFunctionReturn(0); 853 } 854 855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856 { 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 PetscCall(PetscLogGpuTimeBegin()); 861 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 864 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 865 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 866 PetscCall(PetscLogGpuTimeEnd()); 867 PetscFunctionReturn(0); 868 } 869 870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871 { 872 Mat_SeqAIJKokkos *aijkok; 873 874 PetscFunctionBegin; 875 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 876 if (aijkok) { /* Only zero the device if data is already there */ 877 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 878 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 879 } else { /* Might be preallocated but not assembled */ 880 PetscCall(MatZeroEntries_SeqAIJ(A)); 881 } 882 PetscFunctionReturn(0); 883 } 884 885 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x) 886 { 887 Mat_SeqAIJ *aijseq; 888 Mat_SeqAIJKokkos *aijkok; 889 PetscInt n; 890 PetscScalarKokkosView xv; 891 892 PetscFunctionBegin; 893 PetscCall(VecGetLocalSize(x,&n)); 894 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 895 PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 896 897 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 898 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 899 900 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 901 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 902 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 903 aijkok->SetDiagonal(aijseq->diag); 904 } 905 906 const auto& Aa = aijkok->a_dual.view_device(); 907 const auto& Ai = aijkok->i_dual.view_device(); 908 const auto& Adiag = aijkok->diag_dual.view_device(); 909 910 PetscCall(VecGetKokkosViewWrite(x,&xv)); 911 Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) { 912 if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i)); 913 else xv(i) = 0; 914 }); 915 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 916 PetscFunctionReturn(0); 917 } 918 919 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 920 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 921 { 922 Mat_SeqAIJKokkos *aijkok; 923 924 PetscFunctionBegin; 925 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 926 PetscValidPointer(kv,2); 927 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 928 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 929 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 930 *kv = aijkok->a_dual.view_device(); 931 PetscFunctionReturn(0); 932 } 933 934 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 935 { 936 PetscFunctionBegin; 937 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 938 PetscValidPointer(kv,2); 939 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 940 PetscFunctionReturn(0); 941 } 942 943 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 944 { 945 Mat_SeqAIJKokkos *aijkok; 946 947 PetscFunctionBegin; 948 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 949 PetscValidPointer(kv,2); 950 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 951 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 952 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 953 *kv = aijkok->a_dual.view_device(); 954 PetscFunctionReturn(0); 955 } 956 957 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 958 { 959 PetscFunctionBegin; 960 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 961 PetscValidPointer(kv,2); 962 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 963 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 964 PetscFunctionReturn(0); 965 } 966 967 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 968 { 969 Mat_SeqAIJKokkos *aijkok; 970 971 PetscFunctionBegin; 972 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 973 PetscValidPointer(kv,2); 974 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 975 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 976 *kv = aijkok->a_dual.view_device(); 977 PetscFunctionReturn(0); 978 } 979 980 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 981 { 982 PetscFunctionBegin; 983 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 984 PetscValidPointer(kv,2); 985 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 986 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 987 PetscFunctionReturn(0); 988 } 989 990 /* Computes Y += alpha X */ 991 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 992 { 993 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 994 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 995 ConstMatScalarKokkosView Xa; 996 MatScalarKokkosView Ya; 997 998 PetscFunctionBegin; 999 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 1000 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 1001 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1002 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1003 PetscCall(PetscLogGpuTimeBegin()); 1004 1005 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1006 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1007 PetscBool e; 1008 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 1009 if (e) { 1010 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 1011 if (e) pattern = SAME_NONZERO_PATTERN; 1012 } 1013 } 1014 1015 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1016 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1017 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1018 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1019 */ 1020 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 1021 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 1022 Xa = xkok->a_dual.view_device(); 1023 Ya = ykok->a_dual.view_device(); 1024 1025 if (pattern == SAME_NONZERO_PATTERN) { 1026 KokkosBlas::axpy(alpha,Xa,Ya); 1027 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1028 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1029 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1030 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1031 1032 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1033 PetscInt i = t.league_rank(); /* row i */ 1034 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1035 PetscInt p,q = Yi(i); 1036 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1037 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1038 if (Xj(p) == Yj(q)) { /* Found it */ 1039 Ya(q) += alpha * Xa(p); 1040 q++; 1041 } else { 1042 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1043 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1044 */ 1045 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = 1046 #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99) 1047 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1048 #else 1049 Kokkos::Experimental::nan("1"); 1050 #endif 1051 } 1052 } 1053 }); 1054 }); 1055 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1056 } else { /* different nonzero patterns */ 1057 Mat Z; 1058 KokkosCsrMatrix zcsr; 1059 KernelHandle kh; 1060 kh.create_spadd_handle(false); 1061 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1062 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1063 zkok = new Mat_SeqAIJKokkos(zcsr); 1064 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1065 PetscCall(MatHeaderReplace(Y,&Z)); 1066 kh.destroy_spadd_handle(); 1067 } 1068 PetscCall(PetscLogGpuTimeEnd()); 1069 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1070 PetscFunctionReturn(0); 1071 } 1072 1073 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1074 { 1075 Mat_SeqAIJKokkos *akok; 1076 Mat_SeqAIJ *aseq; 1077 1078 PetscFunctionBegin; 1079 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1080 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1081 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1082 delete akok; 1083 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1084 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1085 akok->SetUpCOO(aseq); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1090 { 1091 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1092 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1093 PetscCount Annz = aseq->nz; 1094 const PetscCountKokkosView& jmap = akok->jmap_d; 1095 const PetscCountKokkosView& perm = akok->perm_d; 1096 MatScalarKokkosView Aa; 1097 ConstMatScalarKokkosView kv; 1098 PetscMemType memtype; 1099 1100 PetscFunctionBegin; 1101 PetscCall(PetscGetMemType(v,&memtype)); 1102 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1103 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1104 } else { 1105 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1106 } 1107 1108 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1109 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1110 1111 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1112 PetscScalar sum = 0.0; 1113 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1114 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1115 }); 1116 1117 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1118 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1119 PetscFunctionReturn(0); 1120 } 1121 1122 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1123 { 1124 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1125 MatScalarKokkosView Aa; 1126 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1127 PetscInt m = A->rmap->n; 1128 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1129 1130 PetscFunctionBegin; 1131 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1132 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1133 PetscScalar tmp; 1134 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1135 tmp = Aa(Ai(i)); 1136 Aa(Ai(i)) = Aa(Adiag(i)); 1137 Aa(Adiag(i)) = tmp; 1138 } 1139 }); 1140 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1141 PetscFunctionReturn(0); 1142 } 1143 1144 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1145 { 1146 PetscFunctionBegin; 1147 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1148 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1149 B->offloadmask = PETSC_OFFLOAD_CPU; 1150 PetscFunctionReturn(0); 1151 } 1152 1153 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1154 { 1155 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1156 1157 PetscFunctionBegin; 1158 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1159 A->boundtocpu = PETSC_FALSE; 1160 1161 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1162 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1163 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1164 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1165 A->ops->scale = MatScale_SeqAIJKokkos; 1166 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1167 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1168 A->ops->mult = MatMult_SeqAIJKokkos; 1169 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1170 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1171 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1172 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1173 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1174 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1175 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1176 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1177 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1178 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1179 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1180 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1181 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1182 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1183 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1184 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1185 1186 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1187 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1188 PetscFunctionReturn(0); 1189 } 1190 1191 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1192 { 1193 Mat_SeqAIJ *aseq; 1194 PetscInt i,m,n; 1195 1196 PetscFunctionBegin; 1197 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1198 1199 m = akok->nrows(); 1200 n = akok->ncols(); 1201 PetscCall(MatSetSizes(A,m,n,m,n)); 1202 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1203 1204 /* Set up data structures of A as a MATSEQAIJ */ 1205 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1206 aseq = (Mat_SeqAIJ*)(A)->data; 1207 1208 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1209 akok->j_dual.sync_host(); 1210 1211 aseq->i = akok->i_host_data(); 1212 aseq->j = akok->j_host_data(); 1213 aseq->a = akok->a_host_data(); 1214 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1215 aseq->singlemalloc = PETSC_FALSE; 1216 aseq->free_a = PETSC_FALSE; 1217 aseq->free_ij = PETSC_FALSE; 1218 aseq->nz = akok->nnz(); 1219 aseq->maxnz = aseq->nz; 1220 1221 PetscCall(PetscMalloc1(m,&aseq->imax)); 1222 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1223 for (i=0; i<m; i++) { 1224 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1225 } 1226 1227 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1228 akok->nonzerostate = A->nonzerostate; 1229 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1230 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1231 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1232 PetscFunctionReturn(0); 1233 } 1234 1235 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1236 1237 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1238 */ 1239 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1240 { 1241 PetscFunctionBegin; 1242 PetscCall(MatCreate(comm,A)); 1243 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1244 PetscFunctionReturn(0); 1245 } 1246 1247 /* --------------------------------------------------------------------------------*/ 1248 /*@C 1249 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1250 (the default parallel PETSc format). This matrix will ultimately be handled by 1251 Kokkos for calculations. For good matrix 1252 assembly performance the user should preallocate the matrix storage by setting 1253 the parameter nz (or the array nnz). By setting these parameters accurately, 1254 performance during matrix assembly can be increased by more than a factor of 50. 1255 1256 Collective 1257 1258 Input Parameters: 1259 + comm - MPI communicator, set to PETSC_COMM_SELF 1260 . m - number of rows 1261 . n - number of columns 1262 . nz - number of nonzeros per row (same for all rows) 1263 - nnz - array containing the number of nonzeros in the various rows 1264 (possibly different for each row) or NULL 1265 1266 Output Parameter: 1267 . A - the matrix 1268 1269 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1270 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1271 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1272 1273 Notes: 1274 If nnz is given then nz is ignored 1275 1276 The AIJ format (also called the Yale sparse matrix format or 1277 compressed row storage), is fully compatible with standard Fortran 77 1278 storage. That is, the stored row and column indices can begin at 1279 either one (as in Fortran) or zero. See the users' manual for details. 1280 1281 Specify the preallocated storage with either nz or nnz (not both). 1282 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1283 allocation. For large problems you MUST preallocate memory or you 1284 will get TERRIBLE performance, see the users' manual chapter on matrices. 1285 1286 By default, this format uses inodes (identical nodes) when possible, to 1287 improve numerical efficiency of matrix-vector products and solves. We 1288 search for consecutive rows with the same nonzero structure, thereby 1289 reusing matrix information to achieve increased efficiency. 1290 1291 Level: intermediate 1292 1293 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1294 @*/ 1295 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1296 { 1297 PetscFunctionBegin; 1298 PetscCall(PetscKokkosInitializeCheck()); 1299 PetscCall(MatCreate(comm,A)); 1300 PetscCall(MatSetSizes(*A,m,n,m,n)); 1301 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1302 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1303 PetscFunctionReturn(0); 1304 } 1305 1306 typedef Kokkos::TeamPolicy<>::member_type team_member; 1307 // 1308 // This factorization exploits block diagonal matrices with "Nf" (not used). 1309 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1310 // 1311 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1312 { 1313 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1314 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1315 IS isrow = b->row,isicol = b->icol; 1316 const PetscInt *r_h,*ic_h; 1317 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1318 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1319 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1320 PetscBool row_identity,col_identity; 1321 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1322 1323 PetscFunctionBegin; 1324 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1325 PetscCall(MatIsStructurallySymmetric(A,&row_identity)); 1326 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1327 PetscCall(ISGetIndices(isrow,&r_h)); 1328 PetscCall(ISGetIndices(isicol,&ic_h)); 1329 PetscCall(ISGetSize(isicol,&nc)); 1330 PetscCall(PetscLogGpuTimeBegin()); 1331 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1332 { 1333 #define KOKKOS_SHARED_LEVEL 1 1334 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1335 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1336 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1337 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1338 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1339 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1340 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1341 size_t flops_h = 0.0; 1342 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1343 Kokkos::View<size_t> d_flops_k ("flops"); 1344 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1345 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1346 Kokkos::deep_copy (d_flops_k, h_flops_k); 1347 Kokkos::deep_copy (d_r_k, h_r_k); 1348 Kokkos::deep_copy (d_ic_k, h_ic_k); 1349 // Fill A --> fact 1350 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1351 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1352 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1353 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1354 // zero rows of B 1355 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1356 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1357 PetscScalar *baL = ba_d + bi_d[rowb]; 1358 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1359 /* zero (unfactored row) */ 1360 for (int j=0;j<nzbL;j++) baL[j] = 0; 1361 for (int j=0;j<nzbU;j++) baU[j] = 0; 1362 }); 1363 // copy A into B 1364 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1365 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1366 const PetscScalar *av = aa_d + ai_d[rowa]; 1367 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1368 /* load in initial (unfactored row) */ 1369 for (int j=0;j<nza;j++) { 1370 PetscInt colb = ic[ajtmp[j]]; 1371 PetscScalar vala = av[j]; 1372 if (colb == rowb) { 1373 *(ba_d + bdiag_d[rowb]) = vala; 1374 } else { 1375 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1376 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1377 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1378 for (int j=0; j<nz ; j++) { 1379 if (pbj[j] == colb) { 1380 pba[j] = vala; 1381 set++; 1382 break; 1383 } 1384 } 1385 #if !defined(PETSC_HAVE_SYCL) 1386 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1387 #endif 1388 } 1389 } 1390 }); 1391 }); 1392 Kokkos::fence(); 1393 1394 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1395 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1396 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1397 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1398 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1399 const PetscInt start = field*nloc, end = start + nloc; 1400 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1401 // A22 panel update for each row A(1,:) and col A(:,1) 1402 for (int ii=start; ii<end-1; ii++) { 1403 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1404 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1405 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1406 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1407 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1408 PetscInt kIdx = j*Ni + field_block_idx; 1409 if (kIdx >= nzUi) /* void */ ; 1410 else { 1411 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1412 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1413 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1414 size_t st_idx; 1415 // find and do L(k,i) = A(:k,i) / A(i,i) 1416 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1417 // get column, there has got to be a better way 1418 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1419 if (pjL[j] == ii) { 1420 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1421 idx = j; // output 1422 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1423 } 1424 }, st_idx); 1425 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1426 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1427 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1428 #endif 1429 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1430 // U(i+1,:end) 1431 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1432 PetscScalar Uij = baUi[uiIdx]; 1433 PetscInt col = bjUi[uiIdx]; 1434 if (col==myk) { 1435 // A_kk = A_kk - L_ki * U_ij(k) 1436 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1437 *Akkv = *Akkv - L_ki() * Uij; // UiK 1438 } else { 1439 PetscScalar *start, *end, *pAkjv=NULL; 1440 PetscInt high, low; 1441 const PetscInt *startj; 1442 if (col<myk) { // L 1443 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1444 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1445 start = pLki+1; // start at pLki+1, A22(myk,1) 1446 startj= bj_d + bi_d[myk] + idx; 1447 end = ba_d + bi_d[myk+1]; 1448 } else { 1449 PetscInt idx = bdiag_d[myk+1]+1; 1450 start = ba_d + idx; 1451 startj= bj_d + idx; 1452 end = ba_d + bdiag_d[myk]; 1453 } 1454 // search for 'col', use bisection search - TODO 1455 low = 0; 1456 high = (PetscInt)(end-start); 1457 while (high-low > 5) { 1458 int t = (low+high)/2; 1459 if (startj[t] > col) high = t; 1460 else low = t; 1461 } 1462 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1463 if (startj[pAkjv-start] == col) break; 1464 } 1465 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1466 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1467 #endif 1468 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1469 } 1470 }); 1471 } 1472 }); 1473 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1474 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1475 } /* endof for (i=0; i<n; i++) { */ 1476 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1477 }); 1478 Kokkos::fence(); 1479 Kokkos::deep_copy (h_flops_k, d_flops_k); 1480 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1481 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1482 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1483 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1484 /* Invert diagonal for simpler triangular solves */ 1485 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1486 int i = start + outer_index*Ni + lg_rank%Ni; 1487 if (i < end) { 1488 PetscScalar *pv = ba_d + bdiag_d[i]; 1489 *pv = 1.0/(*pv); 1490 } 1491 }); 1492 }); 1493 } 1494 PetscCall(PetscLogGpuTimeEnd()); 1495 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1496 PetscCall(ISRestoreIndices(isrow,&r_h)); 1497 1498 PetscCall(ISIdentity(isrow,&row_identity)); 1499 PetscCall(ISIdentity(isicol,&col_identity)); 1500 if (b->inode.size) { 1501 B->ops->solve = MatSolve_SeqAIJ_Inode; 1502 } else if (row_identity && col_identity) { 1503 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1504 } else { 1505 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1506 } 1507 B->offloadmask = PETSC_OFFLOAD_GPU; 1508 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1509 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1510 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1511 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1512 B->ops->matsolve = MatMatSolve_SeqAIJ; 1513 B->assembled = PETSC_TRUE; 1514 B->preallocated = PETSC_TRUE; 1515 1516 PetscFunctionReturn(0); 1517 } 1518 1519 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1520 { 1521 PetscFunctionBegin; 1522 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1523 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1524 PetscFunctionReturn(0); 1525 } 1526 1527 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1528 { 1529 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1530 1531 PetscFunctionBegin; 1532 if (!factors->sptrsv_symbolic_completed) { 1533 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1534 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1535 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1536 } 1537 PetscFunctionReturn(0); 1538 } 1539 1540 /* Check if we need to update factors etc for transpose solve */ 1541 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1542 { 1543 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1544 MatColIdxType n = A->rmap->n; 1545 1546 PetscFunctionBegin; 1547 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1548 /* Update L^T and do sptrsv symbolic */ 1549 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1550 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1551 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1552 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1553 1554 KokkosKernels::Impl::transpose_matrix< 1555 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1556 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1557 MatRowMapKokkosView,DefaultExecutionSpace>( 1558 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1559 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1560 1561 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1562 We have to sort the indices, until KK provides finer control options. 1563 */ 1564 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1565 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1566 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1567 1568 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1569 1570 /* Update U^T and do sptrsv symbolic */ 1571 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1572 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1573 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1574 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1575 1576 KokkosKernels::Impl::transpose_matrix< 1577 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1578 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1579 MatRowMapKokkosView,DefaultExecutionSpace>( 1580 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1581 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1582 1583 /* Sort indices. See comments above */ 1584 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1585 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1586 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1587 1588 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1589 factors->transpose_updated = PETSC_TRUE; 1590 } 1591 PetscFunctionReturn(0); 1592 } 1593 1594 /* Solve Ax = b, with A = LU */ 1595 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1596 { 1597 ConstPetscScalarKokkosView bv; 1598 PetscScalarKokkosView xv; 1599 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1600 1601 PetscFunctionBegin; 1602 PetscCall(PetscLogGpuTimeBegin()); 1603 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1604 PetscCall(VecGetKokkosView(b,&bv)); 1605 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1606 /* Solve L tmpv = b */ 1607 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1608 /* Solve Ux = tmpv */ 1609 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1610 PetscCall(VecRestoreKokkosView(b,&bv)); 1611 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1612 PetscCall(PetscLogGpuTimeEnd()); 1613 PetscFunctionReturn(0); 1614 } 1615 1616 /* Solve A^T x = b, where A^T = U^T L^T */ 1617 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1618 { 1619 ConstPetscScalarKokkosView bv; 1620 PetscScalarKokkosView xv; 1621 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1622 1623 PetscFunctionBegin; 1624 PetscCall(PetscLogGpuTimeBegin()); 1625 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1626 PetscCall(VecGetKokkosView(b,&bv)); 1627 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1628 /* Solve U^T tmpv = b */ 1629 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1630 1631 /* Solve L^T x = tmpv */ 1632 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1633 PetscCall(VecRestoreKokkosView(b,&bv)); 1634 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1635 PetscCall(PetscLogGpuTimeEnd()); 1636 PetscFunctionReturn(0); 1637 } 1638 1639 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1640 { 1641 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1642 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1643 PetscInt fill_lev = info->levels; 1644 1645 PetscFunctionBegin; 1646 PetscCall(PetscLogGpuTimeBegin()); 1647 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1648 1649 auto a_d = aijkok->a_dual.view_device(); 1650 auto i_d = aijkok->i_dual.view_device(); 1651 auto j_d = aijkok->j_dual.view_device(); 1652 1653 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1654 1655 B->assembled = PETSC_TRUE; 1656 B->preallocated = PETSC_TRUE; 1657 B->ops->solve = MatSolve_SeqAIJKokkos; 1658 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1659 B->ops->matsolve = NULL; 1660 B->ops->matsolvetranspose = NULL; 1661 B->offloadmask = PETSC_OFFLOAD_GPU; 1662 1663 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1664 factors->transpose_updated = PETSC_FALSE; 1665 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1666 /* TODO: log flops, but how to know that? */ 1667 PetscCall(PetscLogGpuTimeEnd()); 1668 PetscFunctionReturn(0); 1669 } 1670 1671 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1672 { 1673 Mat_SeqAIJKokkos *aijkok; 1674 Mat_SeqAIJ *b; 1675 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1676 PetscInt fill_lev = info->levels; 1677 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1678 PetscInt n = A->rmap->n; 1679 1680 PetscFunctionBegin; 1681 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1682 /* Rebuild factors */ 1683 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1684 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1685 1686 /* Create a spiluk handle and then do symbolic factorization */ 1687 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1688 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1689 1690 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1691 1692 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1693 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1694 Kokkos::realloc(factors->iU_d,n+1); 1695 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1696 1697 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1698 auto i_d = aijkok->i_dual.view_device(); 1699 auto j_d = aijkok->j_dual.view_device(); 1700 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1701 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1702 1703 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1704 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1705 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1706 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1707 1708 /* TODO: add options to select sptrsv algorithms */ 1709 /* Create sptrsv handles for L, U and their transpose */ 1710 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1711 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1712 #else 1713 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1714 #endif 1715 1716 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1717 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1718 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1719 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1720 1721 /* Fill fields of the factor matrix B */ 1722 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1723 b = (Mat_SeqAIJ*)B->data; 1724 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1725 B->info.fill_ratio_given = info->fill; 1726 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1727 1728 B->offloadmask = PETSC_OFFLOAD_GPU; 1729 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1730 PetscFunctionReturn(0); 1731 } 1732 1733 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1734 { 1735 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1736 const PetscInt nrows = A->rmap->n; 1737 1738 PetscFunctionBegin; 1739 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1740 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1741 // move B data into Kokkos 1742 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1743 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1744 { 1745 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1746 if (!baijkok->diag_d.extent(0)) { 1747 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1748 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1749 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1750 } 1751 } 1752 PetscFunctionReturn(0); 1753 } 1754 1755 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1756 { 1757 PetscFunctionBegin; 1758 *type = MATSOLVERKOKKOS; 1759 PetscFunctionReturn(0); 1760 } 1761 1762 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1763 { 1764 PetscFunctionBegin; 1765 *type = MATSOLVERKOKKOSDEVICE; 1766 PetscFunctionReturn(0); 1767 } 1768 1769 /*MC 1770 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1771 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1772 1773 Level: beginner 1774 1775 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1776 M*/ 1777 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1778 { 1779 PetscInt n = A->rmap->n; 1780 1781 PetscFunctionBegin; 1782 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1783 PetscCall(MatSetSizes(*B,n,n,n,n)); 1784 (*B)->factortype = ftype; 1785 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1786 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1787 1788 if (ftype == MAT_FACTOR_LU) { 1789 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1790 (*B)->canuseordering = PETSC_TRUE; 1791 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1792 } else if (ftype == MAT_FACTOR_ILU) { 1793 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1794 (*B)->canuseordering = PETSC_FALSE; 1795 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1796 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1797 1798 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1799 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1800 PetscFunctionReturn(0); 1801 } 1802 1803 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1804 { 1805 PetscInt n = A->rmap->n; 1806 1807 PetscFunctionBegin; 1808 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1809 PetscCall(MatSetSizes(*B,n,n,n,n)); 1810 (*B)->factortype = ftype; 1811 (*B)->canuseordering = PETSC_TRUE; 1812 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1813 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1814 1815 if (ftype == MAT_FACTOR_LU) { 1816 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1817 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1818 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1819 1820 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1821 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1822 PetscFunctionReturn(0); 1823 } 1824 1825 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1826 { 1827 PetscFunctionBegin; 1828 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1829 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1830 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1831 PetscFunctionReturn(0); 1832 } 1833 1834 /* Utility to print out a KokkosCsrMatrix for debugging */ 1835 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1836 { 1837 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1838 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1839 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1840 const PetscInt *i = iv.data(); 1841 const PetscInt *j = jv.data(); 1842 const PetscScalar *a = av.data(); 1843 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1844 1845 PetscFunctionBegin; 1846 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1847 for (PetscInt k=0; k<m; k++) { 1848 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1849 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1850 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1851 } 1852 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1853 } 1854 PetscFunctionReturn(0); 1855 } 1856