1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petsc/private/sfimpl.h> 5 #include <petscsystypes.h> 6 #include <petscerror.h> 7 8 #include <Kokkos_Core.hpp> 9 #include <KokkosBlas.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 18 19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3,6,99) 20 #include <KokkosSparse_Utils.hpp> 21 using KokkosSparse::Impl::transpose_matrix; 22 using KokkosSparse::sort_crs_matrix; 23 #else 24 #include <KokkosKernels_Sorting.hpp> 25 using KokkosKernels::Impl::transpose_matrix; 26 using KokkosKernels::sort_crs_matrix; 27 #endif 28 29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 30 31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33 In the latter case, it is important to set a_dual's sync state correctly. 34 */ 35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 36 { 37 Mat_SeqAIJ *aijseq; 38 Mat_SeqAIJKokkos *aijkok; 39 40 PetscFunctionBegin; 41 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 42 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 43 44 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 45 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 46 47 /* If aijkok does not exist, we just copy i, j to device. 48 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49 In both cases, we build a new aijkok structure. 50 */ 51 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52 delete aijkok; 53 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 54 A->spptr = aijkok; 55 } 56 57 if (aijkok->device_mat_d.data()) { 58 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59 } 60 PetscFunctionReturn(0); 61 } 62 63 /* Sync CSR data to device if not yet */ 64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65 { 66 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 67 68 PetscFunctionBegin; 69 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 70 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71 if (aijkok->a_dual.need_sync_device()) { 72 aijkok->a_dual.sync_device(); 73 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 74 aijkok->hermitian_updated = PETSC_FALSE; 75 } 76 PetscFunctionReturn(0); 77 } 78 79 /* Mark the CSR data on device as modified */ 80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81 { 82 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 83 84 PetscFunctionBegin; 85 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 86 aijkok->a_dual.clear_sync_state(); 87 aijkok->a_dual.modify_device(); 88 aijkok->transpose_updated = PETSC_FALSE; 89 aijkok->hermitian_updated = PETSC_FALSE; 90 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 91 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 92 PetscFunctionReturn(0); 93 } 94 95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96 { 97 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 98 99 PetscFunctionBegin; 100 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 101 /* We do not expect one needs factors on host */ 102 PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 103 PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 104 aijkok->a_dual.sync_host(); 105 PetscFunctionReturn(0); 106 } 107 108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 109 { 110 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 111 112 PetscFunctionBegin; 113 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 114 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 115 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 116 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 117 */ 118 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119 aijkok->a_dual.sync_host(); 120 *array = aijkok->a_dual.view_host().data(); 121 } else { /* Happens when calling MatSetValues on a newly created matrix */ 122 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 123 } 124 PetscFunctionReturn(0); 125 } 126 127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 128 { 129 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130 131 PetscFunctionBegin; 132 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 133 PetscFunctionReturn(0); 134 } 135 136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 137 { 138 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 139 140 PetscFunctionBegin; 141 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142 aijkok->a_dual.sync_host(); 143 *array = aijkok->a_dual.view_host().data(); 144 } else { 145 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 146 } 147 PetscFunctionReturn(0); 148 } 149 150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 151 { 152 PetscFunctionBegin; 153 *array = NULL; 154 PetscFunctionReturn(0); 155 } 156 157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 158 { 159 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 160 161 PetscFunctionBegin; 162 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163 *array = aijkok->a_dual.view_host().data(); 164 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 165 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 166 } 167 PetscFunctionReturn(0); 168 } 169 170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 171 { 172 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 173 174 PetscFunctionBegin; 175 if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176 aijkok->a_dual.clear_sync_state(); 177 aijkok->a_dual.modify_host(); 178 } 179 PetscFunctionReturn(0); 180 } 181 182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 183 { 184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 185 186 PetscFunctionBegin; 187 PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 188 189 if (i) *i = aijkok->i_device_data(); 190 if (j) *j = aijkok->j_device_data(); 191 if (a) { 192 aijkok->a_dual.sync_device(); 193 *a = aijkok->a_device_data(); 194 } 195 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 196 PetscFunctionReturn(0); 197 } 198 199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201 { 202 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 203 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204 205 PetscFunctionBegin; 206 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 208 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 209 PetscFunctionReturn(0); 210 } 211 212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214 { 215 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 216 217 PetscFunctionBegin; 218 if (aijkok && aijkok->device_mat_d.data()) { 219 *d_mat = aijkok->device_mat_d.data(); 220 } else { 221 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222 *d_mat = NULL; 223 } 224 PetscFunctionReturn(0); 225 } 226 227 /* Generate the transpose on device and cache it internally */ 228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 229 { 230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 231 232 PetscFunctionBegin; 233 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 234 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 235 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 236 PetscCallCXX(aijkok->a_dual.sync_device()); 237 PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 238 PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 239 aijkok->transpose_updated = PETSC_TRUE; 240 } 241 *csrmatT = &aijkok->csrmatT; 242 PetscFunctionReturn(0); 243 } 244 245 /* Generate the Hermitian on device and cache it internally */ 246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 247 { 248 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 249 250 PetscFunctionBegin; 251 PetscCall(PetscLogGpuTimeBegin()); 252 PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 253 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 254 PetscCallCXX(aijkok->a_dual.sync_device()); 255 PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 256 PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 257 #if defined(PETSC_USE_COMPLEX) 258 const auto& a = aijkok->csrmatH.values; 259 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 260 #endif 261 aijkok->hermitian_updated = PETSC_TRUE; 262 } 263 *csrmatH = &aijkok->csrmatH; 264 PetscCall(PetscLogGpuTimeEnd()); 265 PetscFunctionReturn(0); 266 } 267 268 /* y = A x */ 269 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 270 { 271 Mat_SeqAIJKokkos *aijkok; 272 ConstPetscScalarKokkosView xv; 273 PetscScalarKokkosView yv; 274 275 PetscFunctionBegin; 276 PetscCall(PetscLogGpuTimeBegin()); 277 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 278 PetscCall(VecGetKokkosView(xx,&xv)); 279 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 280 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 281 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 282 PetscCall(VecRestoreKokkosView(xx,&xv)); 283 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 284 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 285 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 286 PetscCall(PetscLogGpuTimeEnd()); 287 PetscFunctionReturn(0); 288 } 289 290 /* y = A^T x */ 291 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 292 { 293 Mat_SeqAIJKokkos *aijkok; 294 const char *mode; 295 ConstPetscScalarKokkosView xv; 296 PetscScalarKokkosView yv; 297 KokkosCsrMatrix *csrmat; 298 299 PetscFunctionBegin; 300 PetscCall(PetscLogGpuTimeBegin()); 301 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 302 PetscCall(VecGetKokkosView(xx,&xv)); 303 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 304 if (A->form_explicit_transpose) { 305 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 306 mode = "N"; 307 } else { 308 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 309 csrmat = &aijkok->csrmat; 310 mode = "T"; 311 } 312 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 313 PetscCall(VecRestoreKokkosView(xx,&xv)); 314 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 315 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 316 PetscCall(PetscLogGpuTimeEnd()); 317 PetscFunctionReturn(0); 318 } 319 320 /* y = A^H x */ 321 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 322 { 323 Mat_SeqAIJKokkos *aijkok; 324 const char *mode; 325 ConstPetscScalarKokkosView xv; 326 PetscScalarKokkosView yv; 327 KokkosCsrMatrix *csrmat; 328 329 PetscFunctionBegin; 330 PetscCall(PetscLogGpuTimeBegin()); 331 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 332 PetscCall(VecGetKokkosView(xx,&xv)); 333 PetscCall(VecGetKokkosViewWrite(yy,&yv)); 334 if (A->form_explicit_transpose) { 335 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 336 mode = "N"; 337 } else { 338 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 339 csrmat = &aijkok->csrmat; 340 mode = "C"; 341 } 342 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 343 PetscCall(VecRestoreKokkosView(xx,&xv)); 344 PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 345 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 346 PetscCall(PetscLogGpuTimeEnd()); 347 PetscFunctionReturn(0); 348 } 349 350 /* z = A x + y */ 351 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 352 { 353 Mat_SeqAIJKokkos *aijkok; 354 ConstPetscScalarKokkosView xv,yv; 355 PetscScalarKokkosView zv; 356 357 PetscFunctionBegin; 358 PetscCall(PetscLogGpuTimeBegin()); 359 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 360 PetscCall(VecGetKokkosView(xx,&xv)); 361 PetscCall(VecGetKokkosView(yy,&yv)); 362 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 363 if (zz != yy) Kokkos::deep_copy(zv,yv); 364 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 365 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 366 PetscCall(VecRestoreKokkosView(xx,&xv)); 367 PetscCall(VecRestoreKokkosView(yy,&yv)); 368 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 369 PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 370 PetscCall(PetscLogGpuTimeEnd()); 371 PetscFunctionReturn(0); 372 } 373 374 /* z = A^T x + y */ 375 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 376 { 377 Mat_SeqAIJKokkos *aijkok; 378 const char *mode; 379 ConstPetscScalarKokkosView xv,yv; 380 PetscScalarKokkosView zv; 381 KokkosCsrMatrix *csrmat; 382 383 PetscFunctionBegin; 384 PetscCall(PetscLogGpuTimeBegin()); 385 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 386 PetscCall(VecGetKokkosView(xx,&xv)); 387 PetscCall(VecGetKokkosView(yy,&yv)); 388 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 389 if (zz != yy) Kokkos::deep_copy(zv,yv); 390 if (A->form_explicit_transpose) { 391 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 392 mode = "N"; 393 } else { 394 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 395 csrmat = &aijkok->csrmat; 396 mode = "T"; 397 } 398 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 399 PetscCall(VecRestoreKokkosView(xx,&xv)); 400 PetscCall(VecRestoreKokkosView(yy,&yv)); 401 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 402 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 403 PetscCall(PetscLogGpuTimeEnd()); 404 PetscFunctionReturn(0); 405 } 406 407 /* z = A^H x + y */ 408 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 409 { 410 Mat_SeqAIJKokkos *aijkok; 411 const char *mode; 412 ConstPetscScalarKokkosView xv,yv; 413 PetscScalarKokkosView zv; 414 KokkosCsrMatrix *csrmat; 415 416 PetscFunctionBegin; 417 PetscCall(PetscLogGpuTimeBegin()); 418 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 419 PetscCall(VecGetKokkosView(xx,&xv)); 420 PetscCall(VecGetKokkosView(yy,&yv)); 421 PetscCall(VecGetKokkosViewWrite(zz,&zv)); 422 if (zz != yy) Kokkos::deep_copy(zv,yv); 423 if (A->form_explicit_transpose) { 424 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 425 mode = "N"; 426 } else { 427 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 428 csrmat = &aijkok->csrmat; 429 mode = "C"; 430 } 431 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 432 PetscCall(VecRestoreKokkosView(xx,&xv)); 433 PetscCall(VecRestoreKokkosView(yy,&yv)); 434 PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 435 PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 436 PetscCall(PetscLogGpuTimeEnd()); 437 PetscFunctionReturn(0); 438 } 439 440 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 441 { 442 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 443 444 PetscFunctionBegin; 445 switch (op) { 446 case MAT_FORM_EXPLICIT_TRANSPOSE: 447 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 448 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 449 A->form_explicit_transpose = flg; 450 break; 451 default: 452 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 453 break; 454 } 455 PetscFunctionReturn(0); 456 } 457 458 /* Depending on reuse, either build a new mat, or use the existing mat */ 459 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 460 { 461 Mat_SeqAIJ *aseq; 462 463 PetscFunctionBegin; 464 PetscCall(PetscKokkosInitializeCheck()); 465 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 466 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 467 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 468 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 469 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 470 PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 471 PetscCall(PetscFree(A->defaultvectype)); 472 PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 473 PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 474 PetscCall(MatSetOps_SeqAIJKokkos(A)); 475 aseq = static_cast<Mat_SeqAIJ*>(A->data); 476 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 477 PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 478 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 479 } 480 } 481 PetscFunctionReturn(0); 482 } 483 484 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 485 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 486 */ 487 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 488 { 489 Mat_SeqAIJ *bseq; 490 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 491 Mat mat; 492 493 PetscFunctionBegin; 494 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 495 PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 496 mat = *B; 497 if (A->assembled) { 498 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 499 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 500 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 501 /* Now copy values to B if needed */ 502 if (dupOption == MAT_COPY_VALUES) { 503 if (akok->a_dual.need_sync_device()) { 504 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 505 bkok->a_dual.modify_host(); 506 } else { /* If device has the latest data, we only copy data on device */ 507 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 508 bkok->a_dual.modify_device(); 509 } 510 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 511 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 512 bkok->a_dual.modify_host(); 513 } 514 mat->spptr = bkok; 515 } 516 517 PetscCall(PetscFree(mat->defaultvectype)); 518 PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 519 PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 520 PetscCall(MatSetOps_SeqAIJKokkos(mat)); 521 PetscFunctionReturn(0); 522 } 523 524 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 525 { 526 Mat At; 527 KokkosCsrMatrix *internT; 528 Mat_SeqAIJKokkos *atkok,*bkok; 529 530 PetscFunctionBegin; 531 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B)); 532 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 533 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 534 /* Deep copy internT, as we want to isolate the internal transpose */ 535 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 536 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 537 if (reuse == MAT_INITIAL_MATRIX) *B = At; 538 else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 539 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 540 if ((*B)->assembled) { 541 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 542 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 543 PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 544 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 545 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 546 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 547 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 548 PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 549 PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 550 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 551 } 552 PetscFunctionReturn(0); 553 } 554 555 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 556 { 557 Mat_SeqAIJKokkos *aijkok; 558 559 PetscFunctionBegin; 560 if (A->factortype == MAT_FACTOR_NONE) { 561 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 562 delete aijkok; 563 } else { 564 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 565 } 566 A->spptr = NULL; 567 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 568 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 569 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 570 PetscCall(MatDestroy_SeqAIJ(A)); 571 PetscFunctionReturn(0); 572 } 573 574 /*MC 575 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 576 577 A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 578 579 Options Database Keys: 580 . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 581 582 Level: beginner 583 584 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 585 M*/ 586 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 587 { 588 PetscFunctionBegin; 589 PetscCall(PetscKokkosInitializeCheck()); 590 PetscCall(MatCreate_SeqAIJ(A)); 591 PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 592 PetscFunctionReturn(0); 593 } 594 595 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 596 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 597 { 598 Mat_SeqAIJ *a,*b; 599 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 600 MatScalarKokkosView aa,ba,ca; 601 MatRowMapKokkosView ai,bi,ci; 602 MatColIdxKokkosView aj,bj,cj; 603 PetscInt m,n,nnz,aN; 604 605 PetscFunctionBegin; 606 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 607 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 608 PetscValidPointer(C,4); 609 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 610 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 611 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 612 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 613 614 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 615 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 616 a = static_cast<Mat_SeqAIJ*>(A->data); 617 b = static_cast<Mat_SeqAIJ*>(B->data); 618 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 619 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 620 aa = akok->a_dual.view_device(); 621 ai = akok->i_dual.view_device(); 622 ba = bkok->a_dual.view_device(); 623 bi = bkok->i_dual.view_device(); 624 m = A->rmap->n; /* M, N and nnz of C */ 625 n = A->cmap->n + B->cmap->n; 626 nnz = a->nz + b->nz; 627 aN = A->cmap->n; /* N of A */ 628 if (reuse == MAT_INITIAL_MATRIX) { 629 aj = akok->j_dual.view_device(); 630 bj = bkok->j_dual.view_device(); 631 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 632 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 633 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 634 ca = ca_dual.view_device(); 635 ci = ci_dual.view_device(); 636 cj = cj_dual.view_device(); 637 638 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 639 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 640 PetscInt i = t.league_rank(); /* row i */ 641 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 642 643 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 644 ci(i) = coffset; 645 if (i == m-1) ci(m) = ai(m) + bi(m); 646 }); 647 648 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 649 if (k < alen) { 650 ca(coffset+k) = aa(ai(i)+k); 651 cj(coffset+k) = aj(ai(i)+k); 652 } else { 653 ca(coffset+k) = ba(bi(i)+k-alen); 654 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 655 } 656 }); 657 }); 658 ca_dual.modify_device(); 659 ci_dual.modify_device(); 660 cj_dual.modify_device(); 661 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 662 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 663 } else if (reuse == MAT_REUSE_MATRIX) { 664 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 665 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 666 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 667 ca = ckok->a_dual.view_device(); 668 ci = ckok->i_dual.view_device(); 669 670 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 671 PetscInt i = t.league_rank(); /* row i */ 672 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 673 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 674 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 675 else ca(ci(i)+k) = ba(bi(i)+k-alen); 676 }); 677 }); 678 PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 679 } 680 PetscFunctionReturn(0); 681 } 682 683 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 684 { 685 PetscFunctionBegin; 686 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 687 PetscFunctionReturn(0); 688 } 689 690 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 691 { 692 Mat_Product *product = C->product; 693 Mat A,B; 694 bool transA,transB; /* use bool, since KK needs this type */ 695 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 696 Mat_SeqAIJ *c; 697 MatProductData_SeqAIJKokkos *pdata; 698 KokkosCsrMatrix *csrmatA,*csrmatB; 699 700 PetscFunctionBegin; 701 MatCheckProduct(C,1); 702 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 703 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 704 705 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 706 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 707 PetscFunctionReturn(0); 708 } 709 710 switch (product->type) { 711 case MATPRODUCT_AB: transA = false; transB = false; break; 712 case MATPRODUCT_AtB: transA = true; transB = false; break; 713 case MATPRODUCT_ABt: transA = false; transB = true; break; 714 default: 715 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 716 } 717 718 A = product->A; 719 B = product->B; 720 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 721 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 722 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 723 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 724 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 725 726 PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 727 728 csrmatA = &akok->csrmat; 729 csrmatB = &bkok->csrmat; 730 731 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 732 if (transA) { 733 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 734 transA = false; 735 } 736 737 if (transB) { 738 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 739 transB = false; 740 } 741 PetscCall(PetscLogGpuTimeBegin()); 742 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 743 PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 744 PetscCall(PetscLogGpuTimeEnd()); 745 PetscCall(MatSeqAIJKokkosModifyDevice(C)); 746 /* shorter version of MatAssemblyEnd_SeqAIJ */ 747 c = (Mat_SeqAIJ*)C->data; 748 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 749 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 750 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 751 c->reallocs = 0; 752 C->info.mallocs = 0; 753 C->info.nz_unneeded = 0; 754 C->assembled = C->was_assembled = PETSC_TRUE; 755 C->num_ass++; 756 PetscFunctionReturn(0); 757 } 758 759 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 760 { 761 Mat_Product *product = C->product; 762 MatProductType ptype; 763 Mat A,B; 764 bool transA,transB; 765 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 766 MatProductData_SeqAIJKokkos *pdata; 767 MPI_Comm comm; 768 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 769 770 PetscFunctionBegin; 771 MatCheckProduct(C,1); 772 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 773 PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 774 A = product->A; 775 B = product->B; 776 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 777 PetscCall(MatSeqAIJKokkosSyncDevice(B)); 778 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 779 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 780 csrmatA = &akok->csrmat; 781 csrmatB = &bkok->csrmat; 782 783 ptype = product->type; 784 switch (ptype) { 785 case MATPRODUCT_AB: transA = false; transB = false; break; 786 case MATPRODUCT_AtB: transA = true; transB = false; break; 787 case MATPRODUCT_ABt: transA = false; transB = true; break; 788 default: 789 SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 790 } 791 792 product->data = pdata = new MatProductData_SeqAIJKokkos(); 793 pdata->kh.set_team_work_size(16); 794 pdata->kh.set_dynamic_scheduling(true); 795 pdata->reusesym = product->api_user; 796 797 /* TODO: add command line options to select spgemm algorithms */ 798 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 799 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 800 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 801 */ 802 pdata->kh.create_spgemm_handle(spgemm_alg); 803 804 PetscCall(PetscLogGpuTimeBegin()); 805 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 806 if (transA) { 807 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 808 transA = false; 809 } 810 811 if (transB) { 812 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 813 transB = false; 814 } 815 816 PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 817 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 818 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 819 calling new Mat_SeqAIJKokkos(). 820 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 821 */ 822 PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 823 PetscCallCXX(sort_crs_matrix(csrmatC)); 824 PetscCall(PetscLogGpuTimeEnd()); 825 826 PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 827 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 828 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 829 PetscFunctionReturn(0); 830 } 831 832 /* handles sparse matrix matrix ops */ 833 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 834 { 835 Mat_Product *product = mat->product; 836 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 837 838 PetscFunctionBegin; 839 MatCheckProduct(mat,1); 840 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 841 if (product->type == MATPRODUCT_ABC) { 842 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 843 } 844 if (Biskok && Ciskok) { 845 switch (product->type) { 846 case MATPRODUCT_AB: 847 case MATPRODUCT_AtB: 848 case MATPRODUCT_ABt: 849 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 850 break; 851 case MATPRODUCT_PtAP: 852 case MATPRODUCT_RARt: 853 case MATPRODUCT_ABC: 854 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 855 break; 856 default: 857 break; 858 } 859 } else { /* fallback for AIJ */ 860 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 861 } 862 PetscFunctionReturn(0); 863 } 864 865 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 866 { 867 Mat_SeqAIJKokkos *aijkok; 868 869 PetscFunctionBegin; 870 PetscCall(PetscLogGpuTimeBegin()); 871 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 872 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 873 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 874 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 875 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 876 PetscCall(PetscLogGpuTimeEnd()); 877 PetscFunctionReturn(0); 878 } 879 880 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 881 { 882 Mat_SeqAIJKokkos *aijkok; 883 884 PetscFunctionBegin; 885 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 886 if (aijkok) { /* Only zero the device if data is already there */ 887 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 888 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 889 } else { /* Might be preallocated but not assembled */ 890 PetscCall(MatZeroEntries_SeqAIJ(A)); 891 } 892 PetscFunctionReturn(0); 893 } 894 895 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x) 896 { 897 Mat_SeqAIJ *aijseq; 898 Mat_SeqAIJKokkos *aijkok; 899 PetscInt n; 900 PetscScalarKokkosView xv; 901 902 PetscFunctionBegin; 903 PetscCall(VecGetLocalSize(x,&n)); 904 PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 905 PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 906 907 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 908 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 909 910 if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 911 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 912 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 913 aijkok->SetDiagonal(aijseq->diag); 914 } 915 916 const auto& Aa = aijkok->a_dual.view_device(); 917 const auto& Ai = aijkok->i_dual.view_device(); 918 const auto& Adiag = aijkok->diag_dual.view_device(); 919 920 PetscCall(VecGetKokkosViewWrite(x,&xv)); 921 Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) { 922 if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i)); 923 else xv(i) = 0; 924 }); 925 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 926 PetscFunctionReturn(0); 927 } 928 929 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 930 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 931 { 932 Mat_SeqAIJKokkos *aijkok; 933 934 PetscFunctionBegin; 935 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 936 PetscValidPointer(kv,2); 937 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 938 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 939 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 940 *kv = aijkok->a_dual.view_device(); 941 PetscFunctionReturn(0); 942 } 943 944 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 945 { 946 PetscFunctionBegin; 947 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 948 PetscValidPointer(kv,2); 949 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 950 PetscFunctionReturn(0); 951 } 952 953 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 954 { 955 Mat_SeqAIJKokkos *aijkok; 956 957 PetscFunctionBegin; 958 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 959 PetscValidPointer(kv,2); 960 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 961 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 962 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 963 *kv = aijkok->a_dual.view_device(); 964 PetscFunctionReturn(0); 965 } 966 967 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 968 { 969 PetscFunctionBegin; 970 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 971 PetscValidPointer(kv,2); 972 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 973 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 974 PetscFunctionReturn(0); 975 } 976 977 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 978 { 979 Mat_SeqAIJKokkos *aijkok; 980 981 PetscFunctionBegin; 982 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 983 PetscValidPointer(kv,2); 984 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 985 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 986 *kv = aijkok->a_dual.view_device(); 987 PetscFunctionReturn(0); 988 } 989 990 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 991 { 992 PetscFunctionBegin; 993 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 994 PetscValidPointer(kv,2); 995 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 996 PetscCall(MatSeqAIJKokkosModifyDevice(A)); 997 PetscFunctionReturn(0); 998 } 999 1000 /* Computes Y += alpha X */ 1001 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 1002 { 1003 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 1004 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 1005 ConstMatScalarKokkosView Xa; 1006 MatScalarKokkosView Ya; 1007 1008 PetscFunctionBegin; 1009 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 1010 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 1011 PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1012 PetscCall(MatSeqAIJKokkosSyncDevice(X)); 1013 PetscCall(PetscLogGpuTimeBegin()); 1014 1015 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1016 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1017 PetscBool e; 1018 PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 1019 if (e) { 1020 PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 1021 if (e) pattern = SAME_NONZERO_PATTERN; 1022 } 1023 } 1024 1025 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1026 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1027 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1028 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1029 */ 1030 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 1031 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 1032 Xa = xkok->a_dual.view_device(); 1033 Ya = ykok->a_dual.view_device(); 1034 1035 if (pattern == SAME_NONZERO_PATTERN) { 1036 KokkosBlas::axpy(alpha,Xa,Ya); 1037 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1038 } else if (pattern == SUBSET_NONZERO_PATTERN) { 1039 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1040 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1041 1042 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1043 PetscInt i = t.league_rank(); /* row i */ 1044 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1045 PetscInt p,q = Yi(i); 1046 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1047 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1048 if (Xj(p) == Yj(q)) { /* Found it */ 1049 Ya(q) += alpha * Xa(p); 1050 q++; 1051 } else { 1052 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1053 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1054 */ 1055 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = 1056 #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99) 1057 Kokkos::nan("1"); /* auto promote the double NaN if needed */ 1058 #else 1059 Kokkos::Experimental::nan("1"); 1060 #endif 1061 } 1062 } 1063 }); 1064 }); 1065 PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1066 } else { /* different nonzero patterns */ 1067 Mat Z; 1068 KokkosCsrMatrix zcsr; 1069 KernelHandle kh; 1070 kh.create_spadd_handle(false); 1071 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1072 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1073 zkok = new Mat_SeqAIJKokkos(zcsr); 1074 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 1075 PetscCall(MatHeaderReplace(Y,&Z)); 1076 kh.destroy_spadd_handle(); 1077 } 1078 PetscCall(PetscLogGpuTimeEnd()); 1079 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1080 PetscFunctionReturn(0); 1081 } 1082 1083 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1084 { 1085 Mat_SeqAIJKokkos *akok; 1086 Mat_SeqAIJ *aseq; 1087 1088 PetscFunctionBegin; 1089 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1090 aseq = static_cast<Mat_SeqAIJ*>(mat->data); 1091 akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1092 delete akok; 1093 mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 1094 PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1095 akok->SetUpCOO(aseq); 1096 PetscFunctionReturn(0); 1097 } 1098 1099 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 1100 { 1101 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 1102 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1103 PetscCount Annz = aseq->nz; 1104 const PetscCountKokkosView& jmap = akok->jmap_d; 1105 const PetscCountKokkosView& perm = akok->perm_d; 1106 MatScalarKokkosView Aa; 1107 ConstMatScalarKokkosView kv; 1108 PetscMemType memtype; 1109 1110 PetscFunctionBegin; 1111 PetscCall(PetscGetMemType(v,&memtype)); 1112 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1113 kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 1114 } else { 1115 kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 1116 } 1117 1118 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1119 else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 1120 1121 Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1122 PetscScalar sum = 0.0; 1123 for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1124 Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1125 }); 1126 1127 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1128 else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 1129 PetscFunctionReturn(0); 1130 } 1131 1132 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 1133 { 1134 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1135 MatScalarKokkosView Aa; 1136 const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 1137 PetscInt m = A->rmap->n; 1138 ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 1139 1140 PetscFunctionBegin; 1141 PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 1142 Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 1143 PetscScalar tmp; 1144 if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 1145 tmp = Aa(Ai(i)); 1146 Aa(Ai(i)) = Aa(Adiag(i)); 1147 Aa(Adiag(i)) = tmp; 1148 } 1149 }); 1150 PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 1151 PetscFunctionReturn(0); 1152 } 1153 1154 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1155 { 1156 PetscFunctionBegin; 1157 PetscCall(MatSeqAIJKokkosSyncHost(A)); 1158 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 1159 B->offloadmask = PETSC_OFFLOAD_CPU; 1160 PetscFunctionReturn(0); 1161 } 1162 1163 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1164 { 1165 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1166 1167 PetscFunctionBegin; 1168 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1169 A->boundtocpu = PETSC_FALSE; 1170 1171 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1172 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1173 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1174 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1175 A->ops->scale = MatScale_SeqAIJKokkos; 1176 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1177 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1178 A->ops->mult = MatMult_SeqAIJKokkos; 1179 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1180 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1181 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1182 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1183 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1184 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1185 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1186 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1187 A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1188 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1189 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1190 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1191 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1192 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1193 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1194 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 1195 1196 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 1197 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1198 PetscFunctionReturn(0); 1199 } 1200 1201 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1202 { 1203 Mat_SeqAIJ *aseq; 1204 PetscInt i,m,n; 1205 1206 PetscFunctionBegin; 1207 PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1208 1209 m = akok->nrows(); 1210 n = akok->ncols(); 1211 PetscCall(MatSetSizes(A,m,n,m,n)); 1212 PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1213 1214 /* Set up data structures of A as a MATSEQAIJ */ 1215 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1216 aseq = (Mat_SeqAIJ*)(A)->data; 1217 1218 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1219 akok->j_dual.sync_host(); 1220 1221 aseq->i = akok->i_host_data(); 1222 aseq->j = akok->j_host_data(); 1223 aseq->a = akok->a_host_data(); 1224 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1225 aseq->singlemalloc = PETSC_FALSE; 1226 aseq->free_a = PETSC_FALSE; 1227 aseq->free_ij = PETSC_FALSE; 1228 aseq->nz = akok->nnz(); 1229 aseq->maxnz = aseq->nz; 1230 1231 PetscCall(PetscMalloc1(m,&aseq->imax)); 1232 PetscCall(PetscMalloc1(m,&aseq->ilen)); 1233 for (i=0; i<m; i++) { 1234 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1235 } 1236 1237 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1238 akok->nonzerostate = A->nonzerostate; 1239 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 1240 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 1241 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1242 PetscFunctionReturn(0); 1243 } 1244 1245 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1246 1247 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1248 */ 1249 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1250 { 1251 PetscFunctionBegin; 1252 PetscCall(MatCreate(comm,A)); 1253 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 /* --------------------------------------------------------------------------------*/ 1258 /*@C 1259 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1260 (the default parallel PETSc format). This matrix will ultimately be handled by 1261 Kokkos for calculations. For good matrix 1262 assembly performance the user should preallocate the matrix storage by setting 1263 the parameter nz (or the array nnz). By setting these parameters accurately, 1264 performance during matrix assembly can be increased by more than a factor of 50. 1265 1266 Collective 1267 1268 Input Parameters: 1269 + comm - MPI communicator, set to PETSC_COMM_SELF 1270 . m - number of rows 1271 . n - number of columns 1272 . nz - number of nonzeros per row (same for all rows) 1273 - nnz - array containing the number of nonzeros in the various rows 1274 (possibly different for each row) or NULL 1275 1276 Output Parameter: 1277 . A - the matrix 1278 1279 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1280 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1281 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1282 1283 Notes: 1284 If nnz is given then nz is ignored 1285 1286 The AIJ format (also called the Yale sparse matrix format or 1287 compressed row storage), is fully compatible with standard Fortran 77 1288 storage. That is, the stored row and column indices can begin at 1289 either one (as in Fortran) or zero. See the users' manual for details. 1290 1291 Specify the preallocated storage with either nz or nnz (not both). 1292 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1293 allocation. For large problems you MUST preallocate memory or you 1294 will get TERRIBLE performance, see the users' manual chapter on matrices. 1295 1296 By default, this format uses inodes (identical nodes) when possible, to 1297 improve numerical efficiency of matrix-vector products and solves. We 1298 search for consecutive rows with the same nonzero structure, thereby 1299 reusing matrix information to achieve increased efficiency. 1300 1301 Level: intermediate 1302 1303 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 1304 @*/ 1305 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1306 { 1307 PetscFunctionBegin; 1308 PetscCall(PetscKokkosInitializeCheck()); 1309 PetscCall(MatCreate(comm,A)); 1310 PetscCall(MatSetSizes(*A,m,n,m,n)); 1311 PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 1312 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 1313 PetscFunctionReturn(0); 1314 } 1315 1316 typedef Kokkos::TeamPolicy<>::member_type team_member; 1317 // 1318 // This factorization exploits block diagonal matrices with "Nf" (not used). 1319 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1320 // 1321 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1322 { 1323 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1324 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1325 IS isrow = b->row,isicol = b->icol; 1326 const PetscInt *r_h,*ic_h; 1327 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1328 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1329 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1330 PetscBool row_identity,col_identity; 1331 PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1332 1333 PetscFunctionBegin; 1334 PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1335 PetscCall(MatIsStructurallySymmetric(A,&row_identity)); 1336 PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1337 PetscCall(ISGetIndices(isrow,&r_h)); 1338 PetscCall(ISGetIndices(isicol,&ic_h)); 1339 PetscCall(ISGetSize(isicol,&nc)); 1340 PetscCall(PetscLogGpuTimeBegin()); 1341 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1342 { 1343 #define KOKKOS_SHARED_LEVEL 1 1344 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1345 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1346 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1347 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1348 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1349 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1350 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1351 size_t flops_h = 0.0; 1352 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1353 Kokkos::View<size_t> d_flops_k ("flops"); 1354 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1355 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1356 Kokkos::deep_copy (d_flops_k, h_flops_k); 1357 Kokkos::deep_copy (d_r_k, h_r_k); 1358 Kokkos::deep_copy (d_ic_k, h_ic_k); 1359 // Fill A --> fact 1360 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1361 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1362 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1363 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1364 // zero rows of B 1365 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1366 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1367 PetscScalar *baL = ba_d + bi_d[rowb]; 1368 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1369 /* zero (unfactored row) */ 1370 for (int j=0;j<nzbL;j++) baL[j] = 0; 1371 for (int j=0;j<nzbU;j++) baU[j] = 0; 1372 }); 1373 // copy A into B 1374 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1375 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1376 const PetscScalar *av = aa_d + ai_d[rowa]; 1377 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1378 /* load in initial (unfactored row) */ 1379 for (int j=0;j<nza;j++) { 1380 PetscInt colb = ic[ajtmp[j]]; 1381 PetscScalar vala = av[j]; 1382 if (colb == rowb) { 1383 *(ba_d + bdiag_d[rowb]) = vala; 1384 } else { 1385 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1386 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1387 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1388 for (int j=0; j<nz ; j++) { 1389 if (pbj[j] == colb) { 1390 pba[j] = vala; 1391 set++; 1392 break; 1393 } 1394 } 1395 #if !defined(PETSC_HAVE_SYCL) 1396 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1397 #endif 1398 } 1399 } 1400 }); 1401 }); 1402 Kokkos::fence(); 1403 1404 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1405 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1406 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1407 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1408 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1409 const PetscInt start = field*nloc, end = start + nloc; 1410 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1411 // A22 panel update for each row A(1,:) and col A(:,1) 1412 for (int ii=start; ii<end-1; ii++) { 1413 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1414 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1415 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1416 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1417 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1418 PetscInt kIdx = j*Ni + field_block_idx; 1419 if (kIdx >= nzUi) /* void */ ; 1420 else { 1421 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1422 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1423 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1424 size_t st_idx; 1425 // find and do L(k,i) = A(:k,i) / A(i,i) 1426 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1427 // get column, there has got to be a better way 1428 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1429 if (pjL[j] == ii) { 1430 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1431 idx = j; // output 1432 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1433 } 1434 }, st_idx); 1435 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1436 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1437 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1438 #endif 1439 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1440 // U(i+1,:end) 1441 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1442 PetscScalar Uij = baUi[uiIdx]; 1443 PetscInt col = bjUi[uiIdx]; 1444 if (col==myk) { 1445 // A_kk = A_kk - L_ki * U_ij(k) 1446 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1447 *Akkv = *Akkv - L_ki() * Uij; // UiK 1448 } else { 1449 PetscScalar *start, *end, *pAkjv=NULL; 1450 PetscInt high, low; 1451 const PetscInt *startj; 1452 if (col<myk) { // L 1453 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1454 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1455 start = pLki+1; // start at pLki+1, A22(myk,1) 1456 startj= bj_d + bi_d[myk] + idx; 1457 end = ba_d + bi_d[myk+1]; 1458 } else { 1459 PetscInt idx = bdiag_d[myk+1]+1; 1460 start = ba_d + idx; 1461 startj= bj_d + idx; 1462 end = ba_d + bdiag_d[myk]; 1463 } 1464 // search for 'col', use bisection search - TODO 1465 low = 0; 1466 high = (PetscInt)(end-start); 1467 while (high-low > 5) { 1468 int t = (low+high)/2; 1469 if (startj[t] > col) high = t; 1470 else low = t; 1471 } 1472 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1473 if (startj[pAkjv-start] == col) break; 1474 } 1475 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1476 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1477 #endif 1478 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1479 } 1480 }); 1481 } 1482 }); 1483 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1484 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1485 } /* endof for (i=0; i<n; i++) { */ 1486 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1487 }); 1488 Kokkos::fence(); 1489 Kokkos::deep_copy (h_flops_k, d_flops_k); 1490 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1491 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1492 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1493 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1494 /* Invert diagonal for simpler triangular solves */ 1495 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1496 int i = start + outer_index*Ni + lg_rank%Ni; 1497 if (i < end) { 1498 PetscScalar *pv = ba_d + bdiag_d[i]; 1499 *pv = 1.0/(*pv); 1500 } 1501 }); 1502 }); 1503 } 1504 PetscCall(PetscLogGpuTimeEnd()); 1505 PetscCall(ISRestoreIndices(isicol,&ic_h)); 1506 PetscCall(ISRestoreIndices(isrow,&r_h)); 1507 1508 PetscCall(ISIdentity(isrow,&row_identity)); 1509 PetscCall(ISIdentity(isicol,&col_identity)); 1510 if (b->inode.size) { 1511 B->ops->solve = MatSolve_SeqAIJ_Inode; 1512 } else if (row_identity && col_identity) { 1513 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1514 } else { 1515 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1516 } 1517 B->offloadmask = PETSC_OFFLOAD_GPU; 1518 PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 1519 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1520 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1521 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1522 B->ops->matsolve = MatMatSolve_SeqAIJ; 1523 B->assembled = PETSC_TRUE; 1524 B->preallocated = PETSC_TRUE; 1525 1526 PetscFunctionReturn(0); 1527 } 1528 1529 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1530 { 1531 PetscFunctionBegin; 1532 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1533 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1534 PetscFunctionReturn(0); 1535 } 1536 1537 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1538 { 1539 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1540 1541 PetscFunctionBegin; 1542 if (!factors->sptrsv_symbolic_completed) { 1543 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1544 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1545 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1546 } 1547 PetscFunctionReturn(0); 1548 } 1549 1550 /* Check if we need to update factors etc for transpose solve */ 1551 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1552 { 1553 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1554 MatColIdxType n = A->rmap->n; 1555 1556 PetscFunctionBegin; 1557 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1558 /* Update L^T and do sptrsv symbolic */ 1559 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1560 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1561 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1562 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1563 1564 transpose_matrix< 1565 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1566 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1567 MatRowMapKokkosView,DefaultExecutionSpace>( 1568 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1569 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1570 1571 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1572 We have to sort the indices, until KK provides finer control options. 1573 */ 1574 sort_crs_matrix<DefaultExecutionSpace, 1575 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1576 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1577 1578 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1579 1580 /* Update U^T and do sptrsv symbolic */ 1581 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1582 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1583 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1584 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1585 1586 transpose_matrix< 1587 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1588 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1589 MatRowMapKokkosView,DefaultExecutionSpace>( 1590 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1591 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1592 1593 /* Sort indices. See comments above */ 1594 sort_crs_matrix<DefaultExecutionSpace, 1595 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1596 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1597 1598 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1599 factors->transpose_updated = PETSC_TRUE; 1600 } 1601 PetscFunctionReturn(0); 1602 } 1603 1604 /* Solve Ax = b, with A = LU */ 1605 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1606 { 1607 ConstPetscScalarKokkosView bv; 1608 PetscScalarKokkosView xv; 1609 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1610 1611 PetscFunctionBegin; 1612 PetscCall(PetscLogGpuTimeBegin()); 1613 PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 1614 PetscCall(VecGetKokkosView(b,&bv)); 1615 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1616 /* Solve L tmpv = b */ 1617 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1618 /* Solve Ux = tmpv */ 1619 PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1620 PetscCall(VecRestoreKokkosView(b,&bv)); 1621 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1622 PetscCall(PetscLogGpuTimeEnd()); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 /* Solve A^T x = b, where A^T = U^T L^T */ 1627 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1628 { 1629 ConstPetscScalarKokkosView bv; 1630 PetscScalarKokkosView xv; 1631 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1632 1633 PetscFunctionBegin; 1634 PetscCall(PetscLogGpuTimeBegin()); 1635 PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 1636 PetscCall(VecGetKokkosView(b,&bv)); 1637 PetscCall(VecGetKokkosViewWrite(x,&xv)); 1638 /* Solve U^T tmpv = b */ 1639 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1640 1641 /* Solve L^T x = tmpv */ 1642 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1643 PetscCall(VecRestoreKokkosView(b,&bv)); 1644 PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 1645 PetscCall(PetscLogGpuTimeEnd()); 1646 PetscFunctionReturn(0); 1647 } 1648 1649 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1650 { 1651 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1652 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1653 PetscInt fill_lev = info->levels; 1654 1655 PetscFunctionBegin; 1656 PetscCall(PetscLogGpuTimeBegin()); 1657 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1658 1659 auto a_d = aijkok->a_dual.view_device(); 1660 auto i_d = aijkok->i_dual.view_device(); 1661 auto j_d = aijkok->j_dual.view_device(); 1662 1663 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1664 1665 B->assembled = PETSC_TRUE; 1666 B->preallocated = PETSC_TRUE; 1667 B->ops->solve = MatSolve_SeqAIJKokkos; 1668 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1669 B->ops->matsolve = NULL; 1670 B->ops->matsolvetranspose = NULL; 1671 B->offloadmask = PETSC_OFFLOAD_GPU; 1672 1673 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1674 factors->transpose_updated = PETSC_FALSE; 1675 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1676 /* TODO: log flops, but how to know that? */ 1677 PetscCall(PetscLogGpuTimeEnd()); 1678 PetscFunctionReturn(0); 1679 } 1680 1681 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1682 { 1683 Mat_SeqAIJKokkos *aijkok; 1684 Mat_SeqAIJ *b; 1685 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1686 PetscInt fill_lev = info->levels; 1687 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1688 PetscInt n = A->rmap->n; 1689 1690 PetscFunctionBegin; 1691 PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1692 /* Rebuild factors */ 1693 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1694 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1695 1696 /* Create a spiluk handle and then do symbolic factorization */ 1697 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1698 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1699 1700 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1701 1702 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1703 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1704 Kokkos::realloc(factors->iU_d,n+1); 1705 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1706 1707 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1708 auto i_d = aijkok->i_dual.view_device(); 1709 auto j_d = aijkok->j_dual.view_device(); 1710 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1711 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1712 1713 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1714 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1715 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1716 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1717 1718 /* TODO: add options to select sptrsv algorithms */ 1719 /* Create sptrsv handles for L, U and their transpose */ 1720 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1721 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1722 #else 1723 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1724 #endif 1725 1726 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1727 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1728 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1729 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1730 1731 /* Fill fields of the factor matrix B */ 1732 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 1733 b = (Mat_SeqAIJ*)B->data; 1734 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1735 B->info.fill_ratio_given = info->fill; 1736 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1737 1738 B->offloadmask = PETSC_OFFLOAD_GPU; 1739 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1740 PetscFunctionReturn(0); 1741 } 1742 1743 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1744 { 1745 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1746 const PetscInt nrows = A->rmap->n; 1747 1748 PetscFunctionBegin; 1749 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 1750 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1751 // move B data into Kokkos 1752 PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 1753 PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 1754 { 1755 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1756 if (!baijkok->diag_d.extent(0)) { 1757 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1758 baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1759 Kokkos::deep_copy (baijkok->diag_d, h_diag); 1760 } 1761 } 1762 PetscFunctionReturn(0); 1763 } 1764 1765 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1766 { 1767 PetscFunctionBegin; 1768 *type = MATSOLVERKOKKOS; 1769 PetscFunctionReturn(0); 1770 } 1771 1772 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1773 { 1774 PetscFunctionBegin; 1775 *type = MATSOLVERKOKKOSDEVICE; 1776 PetscFunctionReturn(0); 1777 } 1778 1779 /*MC 1780 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1781 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1782 1783 Level: beginner 1784 1785 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1786 M*/ 1787 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1788 { 1789 PetscInt n = A->rmap->n; 1790 1791 PetscFunctionBegin; 1792 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1793 PetscCall(MatSetSizes(*B,n,n,n,n)); 1794 (*B)->factortype = ftype; 1795 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1796 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1797 1798 if (ftype == MAT_FACTOR_LU) { 1799 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1800 (*B)->canuseordering = PETSC_TRUE; 1801 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1802 } else if (ftype == MAT_FACTOR_ILU) { 1803 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1804 (*B)->canuseordering = PETSC_FALSE; 1805 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1806 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1807 1808 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1809 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1810 PetscFunctionReturn(0); 1811 } 1812 1813 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1814 { 1815 PetscInt n = A->rmap->n; 1816 1817 PetscFunctionBegin; 1818 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1819 PetscCall(MatSetSizes(*B,n,n,n,n)); 1820 (*B)->factortype = ftype; 1821 (*B)->canuseordering = PETSC_TRUE; 1822 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1823 PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1824 1825 if (ftype == MAT_FACTOR_LU) { 1826 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1827 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1828 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1829 1830 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1831 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 1832 PetscFunctionReturn(0); 1833 } 1834 1835 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1836 { 1837 PetscFunctionBegin; 1838 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1839 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 1840 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 1841 PetscFunctionReturn(0); 1842 } 1843 1844 /* Utility to print out a KokkosCsrMatrix for debugging */ 1845 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1846 { 1847 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1848 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1849 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1850 const PetscInt *i = iv.data(); 1851 const PetscInt *j = jv.data(); 1852 const PetscScalar *a = av.data(); 1853 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1854 1855 PetscFunctionBegin; 1856 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1857 for (PetscInt k=0; k<m; k++) { 1858 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1859 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1860 PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1861 } 1862 PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1863 } 1864 PetscFunctionReturn(0); 1865 } 1866