1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 226 CHKERRCXX(aijkok->a_dual.sync_device()); 227 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 228 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 229 #if defined(PETSC_USE_COMPLEX) 230 const auto& a = aijkok->csrmatH.values; 231 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 232 #endif 233 aijkok->hermitian_updated = PETSC_TRUE; 234 } 235 *csrmatH = &aijkok->csrmatH; 236 PetscFunctionReturn(0); 237 } 238 239 /* y = A x */ 240 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 241 { 242 PetscErrorCode ierr; 243 Mat_SeqAIJKokkos *aijkok; 244 ConstPetscScalarKokkosView xv; 245 PetscScalarKokkosView yv; 246 247 PetscFunctionBegin; 248 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 249 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 250 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 251 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 252 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 253 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 255 ierr = WaitForKokkos();CHKERRQ(ierr); 256 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 257 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 258 PetscFunctionReturn(0); 259 } 260 261 /* y = A^T x */ 262 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 263 { 264 PetscErrorCode ierr; 265 Mat_SeqAIJKokkos *aijkok; 266 const char *mode; 267 ConstPetscScalarKokkosView xv; 268 PetscScalarKokkosView yv; 269 KokkosCsrMatrix *csrmat; 270 271 PetscFunctionBegin; 272 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 273 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 275 if (A->form_explicit_transpose) { 276 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 277 mode = "N"; 278 } else { 279 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 280 csrmat = &aijkok->csrmat; 281 mode = "T"; 282 } 283 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 284 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 285 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 286 ierr = WaitForKokkos();CHKERRQ(ierr); 287 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 288 PetscFunctionReturn(0); 289 } 290 291 /* y = A^H x */ 292 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 293 { 294 PetscErrorCode ierr; 295 Mat_SeqAIJKokkos *aijkok; 296 const char *mode; 297 ConstPetscScalarKokkosView xv; 298 PetscScalarKokkosView yv; 299 KokkosCsrMatrix *csrmat; 300 301 PetscFunctionBegin; 302 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 303 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 305 if (A->form_explicit_transpose) { 306 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 307 mode = "N"; 308 } else { 309 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 310 csrmat = &aijkok->csrmat; 311 mode = "C"; 312 } 313 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 314 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 315 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 316 ierr = WaitForKokkos();CHKERRQ(ierr); 317 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 318 PetscFunctionReturn(0); 319 } 320 321 /* z = A x + y */ 322 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 323 { 324 PetscErrorCode ierr; 325 Mat_SeqAIJKokkos *aijkok; 326 ConstPetscScalarKokkosView xv,yv; 327 PetscScalarKokkosView zv; 328 329 PetscFunctionBegin; 330 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 331 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 332 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 333 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 334 if (zz != yy) Kokkos::deep_copy(zv,yv); 335 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 336 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 337 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 340 ierr = WaitForKokkos();CHKERRQ(ierr); 341 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 342 PetscFunctionReturn(0); 343 } 344 345 /* z = A^T x + y */ 346 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 347 { 348 PetscErrorCode ierr; 349 Mat_SeqAIJKokkos *aijkok; 350 const char *mode; 351 ConstPetscScalarKokkosView xv,yv; 352 PetscScalarKokkosView zv; 353 KokkosCsrMatrix *csrmat; 354 355 PetscFunctionBegin; 356 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 357 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 358 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 359 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 360 if (zz != yy) Kokkos::deep_copy(zv,yv); 361 if (A->form_explicit_transpose) { 362 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 363 mode = "N"; 364 } else { 365 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 366 csrmat = &aijkok->csrmat; 367 mode = "T"; 368 } 369 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 370 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 371 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 372 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 373 ierr = WaitForKokkos();CHKERRQ(ierr); 374 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 375 PetscFunctionReturn(0); 376 } 377 378 /* z = A^H x + y */ 379 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 380 { 381 PetscErrorCode ierr; 382 Mat_SeqAIJKokkos *aijkok; 383 const char *mode; 384 ConstPetscScalarKokkosView xv,yv; 385 PetscScalarKokkosView zv; 386 KokkosCsrMatrix *csrmat; 387 388 PetscFunctionBegin; 389 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 390 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 391 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 392 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 393 if (zz != yy) Kokkos::deep_copy(zv,yv); 394 if (A->form_explicit_transpose) { 395 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 396 mode = "N"; 397 } else { 398 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 399 csrmat = &aijkok->csrmat; 400 mode = "C"; 401 } 402 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 403 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 404 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 405 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 406 ierr = WaitForKokkos();CHKERRQ(ierr); 407 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 408 PetscFunctionReturn(0); 409 } 410 411 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 412 { 413 PetscErrorCode ierr; 414 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 415 416 PetscFunctionBegin; 417 switch (op) { 418 case MAT_FORM_EXPLICIT_TRANSPOSE: 419 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 420 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 421 A->form_explicit_transpose = flg; 422 break; 423 default: 424 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 425 break; 426 } 427 PetscFunctionReturn(0); 428 } 429 430 /* Depending on reuse, either build a new mat, or use the existing mat */ 431 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 432 { 433 PetscErrorCode ierr; 434 Mat_SeqAIJ *aseq; 435 436 PetscFunctionBegin; 437 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 438 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 439 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 440 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 441 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 442 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 443 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 444 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 445 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 446 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 447 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 448 aseq = static_cast<Mat_SeqAIJ*>(A->data); 449 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 450 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 451 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 452 } 453 } 454 PetscFunctionReturn(0); 455 } 456 457 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 458 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 459 */ 460 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 461 { 462 PetscErrorCode ierr; 463 Mat_SeqAIJ *bseq; 464 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 465 Mat mat; 466 467 PetscFunctionBegin; 468 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 469 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 470 mat = *B; 471 if (A->assembled) { 472 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 473 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 474 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 475 /* Now copy values to B if needed */ 476 if (dupOption == MAT_COPY_VALUES) { 477 if (akok->a_dual.need_sync_device()) { 478 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 479 bkok->a_dual.modify_host(); 480 } else { /* If device has the latest data, we only copy data on device */ 481 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 482 bkok->a_dual.modify_device(); 483 } 484 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 485 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 486 bkok->a_dual.modify_host(); 487 } 488 mat->spptr = bkok; 489 } 490 491 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 492 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 493 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 494 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 499 { 500 PetscErrorCode ierr; 501 Mat At; 502 KokkosCsrMatrix *internT,*csrmatT; 503 Mat_SeqAIJKokkos *atkok,*bkok; 504 505 PetscFunctionBegin; 506 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 507 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 508 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 509 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 510 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 511 if (reuse == MAT_INITIAL_MATRIX) *B = At; 512 else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 513 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 514 if ((*B)->assembled) { 515 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 516 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 517 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 518 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 519 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 520 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 521 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 522 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 523 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 524 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 525 } 526 PetscFunctionReturn(0); 527 } 528 529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 530 { 531 PetscErrorCode ierr; 532 Mat_SeqAIJKokkos *aijkok; 533 534 PetscFunctionBegin; 535 if (A->factortype == MAT_FACTOR_NONE) { 536 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537 if (aijkok) { 538 if (aijkok->device_mat_d.data()) { 539 delete aijkok->colmap_d; 540 delete aijkok->i_uncompressed_d; 541 } 542 if (aijkok->diag_d) delete aijkok->diag_d; 543 } 544 delete aijkok; 545 } else { 546 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 547 } 548 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 549 A->spptr = NULL; 550 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 555 { 556 PetscErrorCode ierr; 557 558 PetscFunctionBegin; 559 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 560 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 561 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 566 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 567 { 568 PetscErrorCode ierr; 569 Mat_SeqAIJ *a,*b; 570 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 571 MatScalarKokkosView aa,ba,ca; 572 MatRowMapKokkosView ai,bi,ci; 573 MatColIdxKokkosView aj,bj,cj; 574 PetscInt m,n,nnz,aN; 575 576 PetscFunctionBegin; 577 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 578 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 579 PetscValidPointer(C,4); 580 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 581 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 582 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 583 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 584 585 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 586 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 587 a = static_cast<Mat_SeqAIJ*>(A->data); 588 b = static_cast<Mat_SeqAIJ*>(B->data); 589 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 590 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 591 aa = akok->a_dual.view_device(); 592 ai = akok->i_dual.view_device(); 593 ba = bkok->a_dual.view_device(); 594 bi = bkok->i_dual.view_device(); 595 m = A->rmap->n; /* M, N and nnz of C */ 596 n = A->cmap->n + B->cmap->n; 597 nnz = a->nz + b->nz; 598 aN = A->cmap->n; /* N of A */ 599 if (reuse == MAT_INITIAL_MATRIX) { 600 aj = akok->j_dual.view_device(); 601 bj = bkok->j_dual.view_device(); 602 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 603 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 604 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 605 ca = ca_dual.view_device(); 606 ci = ci_dual.view_device(); 607 cj = cj_dual.view_device(); 608 609 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 610 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 611 PetscInt i = t.league_rank(); /* row i */ 612 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 613 614 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 615 ci(i) = coffset; 616 if (i == m-1) ci(m) = ai(m) + bi(m); 617 }); 618 619 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 620 if (k < alen) { 621 ca(coffset+k) = aa(ai(i)+k); 622 cj(coffset+k) = aj(ai(i)+k); 623 } else { 624 ca(coffset+k) = ba(bi(i)+k-alen); 625 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 626 } 627 }); 628 }); 629 ca_dual.modify_device(); 630 ci_dual.modify_device(); 631 cj_dual.modify_device(); 632 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 633 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 634 } else if (reuse == MAT_REUSE_MATRIX) { 635 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 636 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 637 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 638 ca = ckok->a_dual.view_device(); 639 ci = ckok->i_dual.view_device(); 640 641 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 642 PetscInt i = t.league_rank(); /* row i */ 643 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 644 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 645 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 646 else ca(ci(i)+k) = ba(bi(i)+k-alen); 647 }); 648 }); 649 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 650 } 651 PetscFunctionReturn(0); 652 } 653 654 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 655 { 656 PetscFunctionBegin; 657 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 658 PetscFunctionReturn(0); 659 } 660 661 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 662 { 663 PetscErrorCode ierr; 664 Mat_Product *product = C->product; 665 Mat A,B; 666 bool transA,transB; /* use bool, since KK needs this type */ 667 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 668 Mat_SeqAIJ *c; 669 MatProductData_SeqAIJKokkos *pdata; 670 KokkosCsrMatrix *csrmatA,*csrmatB; 671 672 PetscFunctionBegin; 673 MatCheckProduct(C,1); 674 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 675 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 676 677 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 678 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 679 PetscFunctionReturn(0); 680 } 681 682 switch (product->type) { 683 case MATPRODUCT_AB: transA = false; transB = false; break; 684 case MATPRODUCT_AtB: transA = true; transB = false; break; 685 case MATPRODUCT_ABt: transA = false; transB = true; break; 686 default: 687 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 688 } 689 690 A = product->A; 691 B = product->B; 692 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 693 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 694 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 695 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 696 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 697 698 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 699 700 csrmatA = &akok->csrmat; 701 csrmatB = &bkok->csrmat; 702 703 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 704 if (transA) { 705 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 706 transA = false; 707 } 708 709 if (transB) { 710 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 711 transB = false; 712 } 713 714 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 715 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 716 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 717 /* shorter version of MatAssemblyEnd_SeqAIJ */ 718 c = (Mat_SeqAIJ*)C->data; 719 ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 720 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 721 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 722 c->reallocs = 0; 723 C->info.mallocs = 0; 724 C->info.nz_unneeded = 0; 725 C->assembled = C->was_assembled = PETSC_TRUE; 726 C->num_ass++; 727 PetscFunctionReturn(0); 728 } 729 730 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 731 { 732 PetscErrorCode ierr; 733 Mat_Product *product = C->product; 734 MatProductType ptype; 735 Mat A,B; 736 bool transA,transB; 737 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 738 MatProductData_SeqAIJKokkos *pdata; 739 MPI_Comm comm; 740 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 741 742 PetscFunctionBegin; 743 MatCheckProduct(C,1); 744 ierr = PetscObjectGetComm((PetscObject)C,&comm); 745 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 746 A = product->A; 747 B = product->B; 748 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 749 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 750 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 751 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 752 csrmatA = &akok->csrmat; 753 csrmatB = &bkok->csrmat; 754 755 ptype = product->type; 756 switch (ptype) { 757 case MATPRODUCT_AB: transA = false; transB = false; break; 758 case MATPRODUCT_AtB: transA = true; transB = false; break; 759 case MATPRODUCT_ABt: transA = false; transB = true; break; 760 default: 761 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 762 } 763 764 product->data = pdata = new MatProductData_SeqAIJKokkos(); 765 pdata->kh.set_team_work_size(16); 766 pdata->kh.set_dynamic_scheduling(true); 767 pdata->reusesym = product->api_user; 768 769 /* TODO: add command line options to select spgemm algorithms */ 770 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 771 /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 772 We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 773 */ 774 pdata->kh.create_spgemm_handle(spgemm_alg); 775 776 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 777 if (transA) { 778 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 779 transA = false; 780 } 781 782 if (transB) { 783 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 784 transB = false; 785 } 786 787 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 788 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 789 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 790 calling new Mat_SeqAIJKokkos(). 791 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 792 */ 793 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 794 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 795 796 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 797 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 798 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 799 PetscFunctionReturn(0); 800 } 801 802 /* handles sparse matrix matrix ops */ 803 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 804 { 805 PetscErrorCode ierr; 806 Mat_Product *product = mat->product; 807 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 808 809 PetscFunctionBegin; 810 MatCheckProduct(mat,1); 811 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 812 if (product->type == MATPRODUCT_ABC) { 813 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 814 } 815 if (Biskok && Ciskok) { 816 switch (product->type) { 817 case MATPRODUCT_AB: 818 case MATPRODUCT_AtB: 819 case MATPRODUCT_ABt: 820 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 821 break; 822 case MATPRODUCT_PtAP: 823 case MATPRODUCT_RARt: 824 case MATPRODUCT_ABC: 825 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 826 break; 827 default: 828 break; 829 } 830 } else { /* fallback for AIJ */ 831 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 832 } 833 PetscFunctionReturn(0); 834 } 835 836 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 837 { 838 PetscErrorCode ierr; 839 Mat_SeqAIJKokkos *aijkok; 840 841 PetscFunctionBegin; 842 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 843 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 844 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 845 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 846 ierr = WaitForKokkos();CHKERRQ(ierr); 847 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 848 PetscFunctionReturn(0); 849 } 850 851 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 852 { 853 PetscErrorCode ierr; 854 Mat_SeqAIJKokkos *aijkok; 855 856 PetscFunctionBegin; 857 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 858 if (aijkok) { /* Only zero the device if data is already there */ 859 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 860 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 861 } else { /* Might be preallocated but not assembled */ 862 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 863 } 864 PetscFunctionReturn(0); 865 } 866 867 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 868 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 869 { 870 PetscErrorCode ierr; 871 Mat_SeqAIJKokkos *aijkok; 872 873 PetscFunctionBegin; 874 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 875 PetscValidPointer(kv,2); 876 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 877 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 878 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 879 *kv = aijkok->a_dual.view_device(); 880 PetscFunctionReturn(0); 881 } 882 883 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 884 { 885 PetscFunctionBegin; 886 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 887 PetscValidPointer(kv,2); 888 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 889 PetscFunctionReturn(0); 890 } 891 892 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 893 { 894 PetscErrorCode ierr; 895 Mat_SeqAIJKokkos *aijkok; 896 897 PetscFunctionBegin; 898 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 899 PetscValidPointer(kv,2); 900 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 901 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 902 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 903 *kv = aijkok->a_dual.view_device(); 904 PetscFunctionReturn(0); 905 } 906 907 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 908 { 909 PetscErrorCode ierr; 910 911 PetscFunctionBegin; 912 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 913 PetscValidPointer(kv,2); 914 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 915 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 920 { 921 Mat_SeqAIJKokkos *aijkok; 922 923 PetscFunctionBegin; 924 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 925 PetscValidPointer(kv,2); 926 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 927 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 928 *kv = aijkok->a_dual.view_device(); 929 PetscFunctionReturn(0); 930 } 931 932 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 933 { 934 PetscErrorCode ierr; 935 936 PetscFunctionBegin; 937 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 938 PetscValidPointer(kv,2); 939 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 940 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 941 PetscFunctionReturn(0); 942 } 943 944 /* Computes Y += alpha X */ 945 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 946 { 947 PetscErrorCode ierr; 948 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 949 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 950 ConstMatScalarKokkosView Xa; 951 MatScalarKokkosView Ya; 952 953 PetscFunctionBegin; 954 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 955 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 956 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 957 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 958 959 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 960 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 961 PetscBool e; 962 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 963 if (e) { 964 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 965 if (e) pattern = SAME_NONZERO_PATTERN; 966 } 967 } 968 969 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 970 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 971 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 972 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 973 */ 974 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 975 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 976 Xa = xkok->a_dual.view_device(); 977 Ya = ykok->a_dual.view_device(); 978 979 if (pattern == SAME_NONZERO_PATTERN) { 980 KokkosBlas::axpy(alpha,Xa,Ya); 981 ierr = MatSeqAIJKokkosModifyDevice(Y); 982 } else if (pattern == SUBSET_NONZERO_PATTERN) { 983 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 984 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 985 986 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 987 PetscInt i = t.league_rank(); /* row i */ 988 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 989 PetscInt p,q = Yi(i); 990 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 991 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 992 if (Xj(p) == Yj(q)) { /* Found it */ 993 Ya(q) += alpha * Xa(p); 994 q++; 995 } else { 996 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 997 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 998 */ 999 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1000 } 1001 } 1002 }); 1003 }); 1004 ierr = MatSeqAIJKokkosModifyDevice(Y); 1005 } else { /* different nonzero patterns */ 1006 Mat Z; 1007 KokkosCsrMatrix zcsr; 1008 KernelHandle kh; 1009 kh.create_spadd_handle(false); 1010 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1011 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1012 zkok = new Mat_SeqAIJKokkos(zcsr); 1013 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1014 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1015 kh.destroy_spadd_handle(); 1016 } 1017 1018 PetscFunctionReturn(0); 1019 } 1020 1021 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1022 { 1023 PetscErrorCode ierr; 1024 1025 PetscFunctionBegin; 1026 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1027 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1028 B->offloadmask = PETSC_OFFLOAD_CPU; 1029 PetscFunctionReturn(0); 1030 } 1031 1032 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1033 { 1034 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1035 1036 PetscFunctionBegin; 1037 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1038 A->boundtocpu = PETSC_FALSE; 1039 1040 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1041 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1042 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1043 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1044 A->ops->scale = MatScale_SeqAIJKokkos; 1045 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1046 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1047 A->ops->mult = MatMult_SeqAIJKokkos; 1048 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1049 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1050 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1051 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1052 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1053 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1054 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1055 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1056 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1057 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1058 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1059 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1060 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1061 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1062 PetscFunctionReturn(0); 1063 } 1064 1065 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1066 { 1067 PetscErrorCode ierr; 1068 Mat_SeqAIJ *aseq; 1069 PetscInt i,m,n; 1070 1071 PetscFunctionBegin; 1072 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1073 1074 m = akok->nrows(); 1075 n = akok->ncols(); 1076 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1077 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1078 1079 /* Set up data structures of A as a MATSEQAIJ */ 1080 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1081 aseq = (Mat_SeqAIJ*)(A)->data; 1082 1083 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1084 akok->j_dual.sync_host(); 1085 1086 aseq->i = akok->i_host_data(); 1087 aseq->j = akok->j_host_data(); 1088 aseq->a = akok->a_host_data(); 1089 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1090 aseq->singlemalloc = PETSC_FALSE; 1091 aseq->free_a = PETSC_FALSE; 1092 aseq->free_ij = PETSC_FALSE; 1093 aseq->nz = akok->nnz(); 1094 aseq->maxnz = aseq->nz; 1095 1096 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1097 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1098 for (i=0; i<m; i++) { 1099 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1100 } 1101 1102 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1103 akok->nonzerostate = A->nonzerostate; 1104 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1105 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1106 A->spptr = akok; 1107 PetscFunctionReturn(0); 1108 } 1109 1110 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1111 1112 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1113 */ 1114 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1115 { 1116 PetscErrorCode ierr; 1117 1118 PetscFunctionBegin; 1119 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1120 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1121 PetscFunctionReturn(0); 1122 } 1123 1124 /* --------------------------------------------------------------------------------*/ 1125 /*@C 1126 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1127 (the default parallel PETSc format). This matrix will ultimately be handled by 1128 Kokkos for calculations. For good matrix 1129 assembly performance the user should preallocate the matrix storage by setting 1130 the parameter nz (or the array nnz). By setting these parameters accurately, 1131 performance during matrix assembly can be increased by more than a factor of 50. 1132 1133 Collective 1134 1135 Input Parameters: 1136 + comm - MPI communicator, set to PETSC_COMM_SELF 1137 . m - number of rows 1138 . n - number of columns 1139 . nz - number of nonzeros per row (same for all rows) 1140 - nnz - array containing the number of nonzeros in the various rows 1141 (possibly different for each row) or NULL 1142 1143 Output Parameter: 1144 . A - the matrix 1145 1146 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1147 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1148 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1149 1150 Notes: 1151 If nnz is given then nz is ignored 1152 1153 The AIJ format (also called the Yale sparse matrix format or 1154 compressed row storage), is fully compatible with standard Fortran 77 1155 storage. That is, the stored row and column indices can begin at 1156 either one (as in Fortran) or zero. See the users' manual for details. 1157 1158 Specify the preallocated storage with either nz or nnz (not both). 1159 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1160 allocation. For large problems you MUST preallocate memory or you 1161 will get TERRIBLE performance, see the users' manual chapter on matrices. 1162 1163 By default, this format uses inodes (identical nodes) when possible, to 1164 improve numerical efficiency of matrix-vector products and solves. We 1165 search for consecutive rows with the same nonzero structure, thereby 1166 reusing matrix information to achieve increased efficiency. 1167 1168 Level: intermediate 1169 1170 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1171 @*/ 1172 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1173 { 1174 PetscErrorCode ierr; 1175 1176 PetscFunctionBegin; 1177 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1178 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1179 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1180 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1181 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 typedef Kokkos::TeamPolicy<>::member_type team_member; 1186 // 1187 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1188 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1189 // 1190 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1191 { 1192 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1193 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1194 IS isrow = b->row,isicol = b->icol; 1195 PetscErrorCode ierr; 1196 const PetscInt *r_h,*ic_h; 1197 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1198 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1199 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1200 PetscBool row_identity,col_identity; 1201 PetscInt nc, Nf, nVec=32; // should be a parameter 1202 PetscContainer container; 1203 1204 PetscFunctionBegin; 1205 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1206 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1207 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1208 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1209 if (container) { 1210 PetscInt *pNf=NULL, nv; 1211 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1212 Nf = (*pNf)%1000; 1213 nv = (*pNf)/1000; 1214 if (nv>0) nVec = nv; 1215 } else Nf = 1; 1216 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf); 1217 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1218 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1219 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1220 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1221 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1222 #endif 1223 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1224 { 1225 #define KOKKOS_SHARED_LEVEL 1 1226 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1227 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1228 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1229 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1230 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1231 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1232 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1233 size_t flops_h = 0.0; 1234 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1235 Kokkos::View<size_t> d_flops_k ("flops"); 1236 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1237 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1238 Kokkos::deep_copy (d_flops_k, h_flops_k); 1239 Kokkos::deep_copy (d_r_k, h_r_k); 1240 Kokkos::deep_copy (d_ic_k, h_ic_k); 1241 // Fill A --> fact 1242 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1243 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1244 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1245 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1246 // zero rows of B 1247 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1248 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1249 PetscScalar *baL = ba_d + bi_d[rowb]; 1250 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1251 /* zero (unfactored row) */ 1252 for (int j=0;j<nzbL;j++) baL[j] = 0; 1253 for (int j=0;j<nzbU;j++) baU[j] = 0; 1254 }); 1255 // copy A into B 1256 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1257 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1258 const PetscScalar *av = aa_d + ai_d[rowa]; 1259 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1260 /* load in initial (unfactored row) */ 1261 for (int j=0;j<nza;j++) { 1262 PetscInt colb = ic[ajtmp[j]]; 1263 PetscScalar vala = av[j]; 1264 if (colb == rowb) { 1265 *(ba_d + bdiag_d[rowb]) = vala; 1266 } else { 1267 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1268 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1269 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1270 for (int j=0; j<nz ; j++) { 1271 if (pbj[j] == colb) { 1272 pba[j] = vala; 1273 set++; 1274 break; 1275 } 1276 } 1277 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1278 } 1279 } 1280 }); 1281 }); 1282 Kokkos::fence(); 1283 1284 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1285 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1286 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1287 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1288 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1289 const PetscInt start = field*nloc, end = start + nloc; 1290 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1291 // A22 panel update for each row A(1,:) and col A(:,1) 1292 for (int ii=start; ii<end-1; ii++) { 1293 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1294 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1295 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1296 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1297 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1298 PetscInt kIdx = j*Ni + field_block_idx; 1299 if (kIdx >= nzUi) /* void */ ; 1300 else { 1301 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1302 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1303 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1304 size_t st_idx; 1305 // find and do L(k,i) = A(:k,i) / A(i,i) 1306 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1307 // get column, there has got to be a better way 1308 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1309 if (pjL[j] == ii) { 1310 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1311 idx = j; // output 1312 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1313 } 1314 }, st_idx); 1315 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1316 #if defined(PETSC_USE_DEBUG) 1317 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1318 #endif 1319 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1320 // U(i+1,:end) 1321 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1322 PetscScalar Uij = baUi[uiIdx]; 1323 PetscInt col = bjUi[uiIdx]; 1324 if (col==myk) { 1325 // A_kk = A_kk - L_ki * U_ij(k) 1326 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1327 *Akkv = *Akkv - L_ki() * Uij; // UiK 1328 } else { 1329 PetscScalar *start, *end, *pAkjv=NULL; 1330 PetscInt high, low; 1331 const PetscInt *startj; 1332 if (col<myk) { // L 1333 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1334 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1335 start = pLki+1; // start at pLki+1, A22(myk,1) 1336 startj= bj_d + bi_d[myk] + idx; 1337 end = ba_d + bi_d[myk+1]; 1338 } else { 1339 PetscInt idx = bdiag_d[myk+1]+1; 1340 start = ba_d + idx; 1341 startj= bj_d + idx; 1342 end = ba_d + bdiag_d[myk]; 1343 } 1344 // search for 'col', use bisection search - TODO 1345 low = 0; 1346 high = (PetscInt)(end-start); 1347 while (high-low > 5) { 1348 int t = (low+high)/2; 1349 if (startj[t] > col) high = t; 1350 else low = t; 1351 } 1352 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1353 if (startj[pAkjv-start] == col) break; 1354 } 1355 #if defined(PETSC_USE_DEBUG) 1356 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1357 #endif 1358 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1359 } 1360 }); 1361 } 1362 }); 1363 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1364 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1365 } /* endof for (i=0; i<n; i++) { */ 1366 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1367 }); 1368 Kokkos::fence(); 1369 Kokkos::deep_copy (h_flops_k, d_flops_k); 1370 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1371 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1372 #elif defined(PETSC_USE_LOG) 1373 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1374 #endif 1375 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1376 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1377 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1378 /* Invert diagonal for simpler triangular solves */ 1379 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1380 int i = start + outer_index*Ni + lg_rank%Ni; 1381 if (i < end) { 1382 PetscScalar *pv = ba_d + bdiag_d[i]; 1383 *pv = 1.0/(*pv); 1384 } 1385 }); 1386 }); 1387 } 1388 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1389 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1390 #endif 1391 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1392 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1393 1394 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1395 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1396 if (b->inode.size) { 1397 B->ops->solve = MatSolve_SeqAIJ_Inode; 1398 } else if (row_identity && col_identity) { 1399 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1400 } else { 1401 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1402 } 1403 B->offloadmask = PETSC_OFFLOAD_GPU; 1404 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1405 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1406 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1407 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1408 B->ops->matsolve = MatMatSolve_SeqAIJ; 1409 B->assembled = PETSC_TRUE; 1410 B->preallocated = PETSC_TRUE; 1411 1412 PetscFunctionReturn(0); 1413 } 1414 1415 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1416 { 1417 PetscErrorCode ierr; 1418 1419 PetscFunctionBegin; 1420 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1421 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1422 PetscFunctionReturn(0); 1423 } 1424 1425 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1426 { 1427 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1428 1429 PetscFunctionBegin; 1430 if (!factors->sptrsv_symbolic_completed) { 1431 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1432 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1433 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1434 } 1435 PetscFunctionReturn(0); 1436 } 1437 1438 /* Check if we need to update factors etc for transpose solve */ 1439 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1440 { 1441 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1442 MatColIdxType n = A->rmap->n; 1443 1444 PetscFunctionBegin; 1445 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1446 /* Update L^T and do sptrsv symbolic */ 1447 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1448 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1449 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1450 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1451 1452 KokkosKernels::Impl::transpose_matrix< 1453 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1454 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1455 MatRowMapKokkosView,DefaultExecutionSpace>( 1456 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1457 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1458 1459 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1460 We have to sort the indices, until KK provides finer control options. 1461 */ 1462 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1463 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1464 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1465 1466 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1467 1468 /* Update U^T and do sptrsv symbolic */ 1469 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1470 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1471 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1472 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1473 1474 KokkosKernels::Impl::transpose_matrix< 1475 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1476 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1477 MatRowMapKokkosView,DefaultExecutionSpace>( 1478 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1479 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1480 1481 /* Sort indices. See comments above */ 1482 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1483 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1484 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1485 1486 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1487 factors->transpose_updated = PETSC_TRUE; 1488 } 1489 PetscFunctionReturn(0); 1490 } 1491 1492 /* Solve Ax = b, with A = LU */ 1493 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1494 { 1495 PetscErrorCode ierr; 1496 ConstPetscScalarKokkosView bv; 1497 PetscScalarKokkosView xv; 1498 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1499 1500 PetscFunctionBegin; 1501 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1502 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1503 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1504 /* Solve L tmpv = b */ 1505 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1506 /* Solve Ux = tmpv */ 1507 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1508 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1509 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1510 PetscFunctionReturn(0); 1511 } 1512 1513 /* Solve A^T x = b, where A^T = U^T L^T */ 1514 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1515 { 1516 PetscErrorCode ierr; 1517 ConstPetscScalarKokkosView bv; 1518 PetscScalarKokkosView xv; 1519 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1520 1521 PetscFunctionBegin; 1522 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1523 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1524 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1525 /* Solve U^T tmpv = b */ 1526 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1527 1528 /* Solve L^T x = tmpv */ 1529 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1530 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1531 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1532 PetscFunctionReturn(0); 1533 } 1534 1535 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1536 { 1537 PetscErrorCode ierr; 1538 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1539 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1540 PetscInt fill_lev = info->levels; 1541 1542 PetscFunctionBegin; 1543 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1544 1545 auto a_d = aijkok->a_dual.view_device(); 1546 auto i_d = aijkok->i_dual.view_device(); 1547 auto j_d = aijkok->j_dual.view_device(); 1548 1549 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1550 1551 B->assembled = PETSC_TRUE; 1552 B->preallocated = PETSC_TRUE; 1553 B->ops->solve = MatSolve_SeqAIJKokkos; 1554 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1555 B->ops->matsolve = NULL; 1556 B->ops->matsolvetranspose = NULL; 1557 B->offloadmask = PETSC_OFFLOAD_GPU; 1558 1559 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1560 factors->transpose_updated = PETSC_FALSE; 1561 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1562 PetscFunctionReturn(0); 1563 } 1564 1565 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1566 { 1567 PetscErrorCode ierr; 1568 Mat_SeqAIJKokkos *aijkok; 1569 Mat_SeqAIJ *b; 1570 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1571 PetscInt fill_lev = info->levels; 1572 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1573 PetscInt n = A->rmap->n; 1574 1575 PetscFunctionBegin; 1576 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1577 /* Rebuild factors */ 1578 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1579 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1580 1581 /* Create a spiluk handle and then do symbolic factorization */ 1582 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1583 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1584 1585 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1586 1587 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1588 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1589 Kokkos::realloc(factors->iU_d,n+1); 1590 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1591 1592 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1593 auto i_d = aijkok->i_dual.view_device(); 1594 auto j_d = aijkok->j_dual.view_device(); 1595 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1596 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1597 1598 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1599 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1600 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1601 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1602 1603 /* TODO: add options to select sptrsv algorithms */ 1604 /* Create sptrsv handles for L, U and their transpose */ 1605 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1606 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1607 #else 1608 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1609 #endif 1610 1611 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1612 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1613 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1614 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1615 1616 /* Fill fields of the factor matrix B */ 1617 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1618 b = (Mat_SeqAIJ*)B->data; 1619 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1620 B->info.fill_ratio_given = info->fill; 1621 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1622 1623 B->offloadmask = PETSC_OFFLOAD_GPU; 1624 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1625 PetscFunctionReturn(0); 1626 } 1627 1628 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1629 { 1630 PetscErrorCode ierr; 1631 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1632 const PetscInt nrows = A->rmap->n; 1633 1634 PetscFunctionBegin; 1635 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1636 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1637 // move B data into Kokkos 1638 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1639 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1640 { 1641 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1642 if (!baijkok->diag_d) { 1643 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1644 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1645 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1646 } 1647 } 1648 PetscFunctionReturn(0); 1649 } 1650 1651 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1652 { 1653 PetscFunctionBegin; 1654 *type = MATSOLVERKOKKOS; 1655 PetscFunctionReturn(0); 1656 } 1657 1658 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1659 { 1660 PetscFunctionBegin; 1661 *type = MATSOLVERKOKKOSDEVICE; 1662 PetscFunctionReturn(0); 1663 } 1664 1665 /*MC 1666 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1667 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1668 1669 Level: beginner 1670 1671 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1672 M*/ 1673 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1674 { 1675 PetscErrorCode ierr; 1676 PetscInt n = A->rmap->n; 1677 1678 PetscFunctionBegin; 1679 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1680 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1681 (*B)->factortype = ftype; 1682 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1683 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1684 1685 if (ftype == MAT_FACTOR_LU) { 1686 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1687 (*B)->canuseordering = PETSC_TRUE; 1688 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1689 } else if (ftype == MAT_FACTOR_ILU) { 1690 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1691 (*B)->canuseordering = PETSC_FALSE; 1692 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1693 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1694 1695 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1696 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1697 PetscFunctionReturn(0); 1698 } 1699 1700 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1701 { 1702 PetscErrorCode ierr; 1703 PetscInt n = A->rmap->n; 1704 1705 PetscFunctionBegin; 1706 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1707 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1708 (*B)->factortype = ftype; 1709 (*B)->canuseordering = PETSC_TRUE; 1710 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1711 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1712 1713 if (ftype == MAT_FACTOR_LU) { 1714 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1715 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1716 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1717 1718 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1719 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1720 PetscFunctionReturn(0); 1721 } 1722 1723 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1724 { 1725 PetscErrorCode ierr; 1726 1727 PetscFunctionBegin; 1728 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1729 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1730 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1731 PetscFunctionReturn(0); 1732 } 1733 1734 /* Utility to print out a KokkosCsrMatrix for debugging */ 1735 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1736 { 1737 PetscErrorCode ierr; 1738 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1739 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1740 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1741 const PetscInt *i = iv.data(); 1742 const PetscInt *j = jv.data(); 1743 const PetscScalar *a = av.data(); 1744 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1745 1746 PetscFunctionBegin; 1747 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1748 for (PetscInt k=0; k<m; k++) { 1749 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1750 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1751 ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1752 } 1753 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1754 } 1755 PetscFunctionReturn(0); 1756 } 1757