1 #include <petscvec_kokkos.hpp> 2 #include <petscpkg_version.h> 3 #include <petsc/private/petscimpl.h> 4 #include <petscsystypes.h> 5 #include <petscerror.h> 6 7 #include <Kokkos_Core.hpp> 8 #include <KokkosBlas.hpp> 9 #include <KokkosKernels_Sorting.hpp> 10 #include <KokkosSparse_CrsMatrix.hpp> 11 #include <KokkosSparse_spmv.hpp> 12 #include <KokkosSparse_spiluk.hpp> 13 #include <KokkosSparse_sptrsv.hpp> 14 #include <KokkosSparse_spgemm.hpp> 15 #include <KokkosSparse_spadd.hpp> 16 17 #include <../src/mat/impls/aij/seq/aij.h> 18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 19 20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 21 22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24 In the latter case, it is important to set a_dual's sync state correctly. 25 */ 26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 27 { 28 PetscErrorCode ierr; 29 Mat_SeqAIJ *aijseq; 30 Mat_SeqAIJKokkos *aijkok; 31 32 PetscFunctionBegin; 33 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 34 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35 36 aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38 39 /* If aijkok does not exist, we just copy i, j to device. 40 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41 In both cases, we build a new aijkok structure. 42 */ 43 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44 delete aijkok; 45 aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46 A->spptr = aijkok; 47 } 48 49 if (aijkok && aijkok->device_mat_d.data()) { 50 A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51 } 52 PetscFunctionReturn(0); 53 } 54 55 /* Sync CSR data to device if not yet */ 56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 57 { 58 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 59 60 PetscFunctionBegin; 61 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62 if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64 if (aijkok->a_dual.need_sync_device()) { 65 aijkok->a_dual.sync_device(); 66 aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 67 aijkok->hermitian_updated = PETSC_FALSE; 68 } 69 PetscFunctionReturn(0); 70 } 71 72 /* Mark the CSR data on device as modified */ 73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 74 { 75 PetscErrorCode ierr; 76 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 77 78 PetscFunctionBegin; 79 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 80 aijkok->a_dual.clear_sync_state(); 81 aijkok->a_dual.modify_device(); 82 aijkok->transpose_updated = PETSC_FALSE; 83 aijkok->hermitian_updated = PETSC_FALSE; 84 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 85 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 86 PetscFunctionReturn(0); 87 } 88 89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 90 { 91 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 92 93 PetscFunctionBegin; 94 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 95 /* We do not expect one needs factors on host */ 96 if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 97 if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 98 aijkok->a_dual.sync_host(); 99 PetscFunctionReturn(0); 100 } 101 102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 103 { 104 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 105 106 PetscFunctionBegin; 107 if (aijkok) { 108 aijkok->a_dual.sync_host(); 109 *array = aijkok->a_dual.view_host().data(); 110 } else { /* Happens when calling MatSetValues on a newly created matrix */ 111 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 112 } 113 PetscFunctionReturn(0); 114 } 115 116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 117 { 118 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 119 120 PetscFunctionBegin; 121 if (aijkok) aijkok->a_dual.modify_host(); 122 PetscFunctionReturn(0); 123 } 124 125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 126 { 127 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 128 129 PetscFunctionBegin; 130 if (aijkok) { 131 aijkok->a_dual.sync_host(); 132 *array = aijkok->a_dual.view_host().data(); 133 } else { 134 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 135 } 136 PetscFunctionReturn(0); 137 } 138 139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 140 { 141 PetscFunctionBegin; 142 *array = NULL; 143 PetscFunctionReturn(0); 144 } 145 146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 147 { 148 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 149 150 PetscFunctionBegin; 151 if (aijkok) { 152 *array = aijkok->a_dual.view_host().data(); 153 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 154 *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 155 } 156 PetscFunctionReturn(0); 157 } 158 159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 160 { 161 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 162 163 PetscFunctionBegin; 164 if (aijkok) { 165 aijkok->a_dual.clear_sync_state(); 166 aijkok->a_dual.modify_host(); 167 } 168 PetscFunctionReturn(0); 169 } 170 171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 173 { 174 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 175 Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 176 177 PetscFunctionBegin; 178 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 179 aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 180 Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 181 PetscFunctionReturn(0); 182 } 183 184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 186 { 187 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 188 189 PetscFunctionBegin; 190 if (aijkok && aijkok->device_mat_d.data()) { 191 *d_mat = aijkok->device_mat_d.data(); 192 } else { 193 PetscErrorCode ierr; 194 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 195 *d_mat = NULL; 196 } 197 PetscFunctionReturn(0); 198 } 199 200 /* Generate the transpose on device and cache it internally */ 201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 202 { 203 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 204 205 PetscFunctionBegin; 206 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207 if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 208 /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 209 CHKERRCXX(aijkok->a_dual.sync_device()); 210 CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 211 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 212 aijkok->transpose_updated = PETSC_TRUE; 213 } 214 *csrmatT = &aijkok->csrmatT; 215 PetscFunctionReturn(0); 216 } 217 218 /* Generate the Hermitian on device and cache it internally */ 219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 220 { 221 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222 223 PetscFunctionBegin; 224 if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225 if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 226 CHKERRCXX(aijkok->a_dual.sync_device()); 227 CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 228 CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 229 #if defined(PETSC_USE_COMPLEX) 230 const auto& a = aijkok->csrmatH.values; 231 Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 232 #endif 233 aijkok->hermitian_updated = PETSC_TRUE; 234 } 235 *csrmatH = &aijkok->csrmatH; 236 PetscFunctionReturn(0); 237 } 238 239 /* y = A x */ 240 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 241 { 242 PetscErrorCode ierr; 243 Mat_SeqAIJKokkos *aijkok; 244 ConstPetscScalarKokkosView xv; 245 PetscScalarKokkosView yv; 246 247 PetscFunctionBegin; 248 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 249 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 250 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 251 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 252 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 253 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 254 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 255 ierr = WaitForKokkos();CHKERRQ(ierr); 256 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 257 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 258 PetscFunctionReturn(0); 259 } 260 261 /* y = A^T x */ 262 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 263 { 264 PetscErrorCode ierr; 265 Mat_SeqAIJKokkos *aijkok; 266 const char *mode; 267 ConstPetscScalarKokkosView xv; 268 PetscScalarKokkosView yv; 269 KokkosCsrMatrix *csrmat; 270 271 PetscFunctionBegin; 272 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 273 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 274 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 275 if (A->form_explicit_transpose) { 276 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 277 mode = "N"; 278 } else { 279 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 280 csrmat = &aijkok->csrmat; 281 mode = "T"; 282 } 283 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 284 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 285 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 286 ierr = WaitForKokkos();CHKERRQ(ierr); 287 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 288 PetscFunctionReturn(0); 289 } 290 291 /* y = A^H x */ 292 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 293 { 294 PetscErrorCode ierr; 295 Mat_SeqAIJKokkos *aijkok; 296 const char *mode; 297 ConstPetscScalarKokkosView xv; 298 PetscScalarKokkosView yv; 299 KokkosCsrMatrix *csrmat; 300 301 PetscFunctionBegin; 302 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 303 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 304 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 305 if (A->form_explicit_transpose) { 306 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 307 mode = "N"; 308 } else { 309 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 310 csrmat = &aijkok->csrmat; 311 mode = "C"; 312 } 313 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 314 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 315 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 316 ierr = WaitForKokkos();CHKERRQ(ierr); 317 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 318 PetscFunctionReturn(0); 319 } 320 321 /* z = A x + y */ 322 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 323 { 324 PetscErrorCode ierr; 325 Mat_SeqAIJKokkos *aijkok; 326 ConstPetscScalarKokkosView xv,yv; 327 PetscScalarKokkosView zv; 328 329 PetscFunctionBegin; 330 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 331 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 332 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 333 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 334 if (zz != yy) Kokkos::deep_copy(zv,yv); 335 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 336 KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 337 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 338 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 339 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 340 ierr = WaitForKokkos();CHKERRQ(ierr); 341 ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 342 PetscFunctionReturn(0); 343 } 344 345 /* z = A^T x + y */ 346 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 347 { 348 PetscErrorCode ierr; 349 Mat_SeqAIJKokkos *aijkok; 350 const char *mode; 351 ConstPetscScalarKokkosView xv,yv; 352 PetscScalarKokkosView zv; 353 KokkosCsrMatrix *csrmat; 354 355 PetscFunctionBegin; 356 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 357 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 358 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 359 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 360 if (zz != yy) Kokkos::deep_copy(zv,yv); 361 if (A->form_explicit_transpose) { 362 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 363 mode = "N"; 364 } else { 365 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 366 csrmat = &aijkok->csrmat; 367 mode = "T"; 368 } 369 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 370 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 371 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 372 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 373 ierr = WaitForKokkos();CHKERRQ(ierr); 374 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 375 PetscFunctionReturn(0); 376 } 377 378 /* z = A^H x + y */ 379 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 380 { 381 PetscErrorCode ierr; 382 Mat_SeqAIJKokkos *aijkok; 383 const char *mode; 384 ConstPetscScalarKokkosView xv,yv; 385 PetscScalarKokkosView zv; 386 KokkosCsrMatrix *csrmat; 387 388 PetscFunctionBegin; 389 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 390 ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 391 ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 392 ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 393 if (zz != yy) Kokkos::deep_copy(zv,yv); 394 if (A->form_explicit_transpose) { 395 ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 396 mode = "N"; 397 } else { 398 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 399 csrmat = &aijkok->csrmat; 400 mode = "C"; 401 } 402 KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 403 ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 404 ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 405 ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 406 ierr = WaitForKokkos();CHKERRQ(ierr); 407 ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 408 PetscFunctionReturn(0); 409 } 410 411 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 412 { 413 PetscErrorCode ierr; 414 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 415 416 PetscFunctionBegin; 417 switch (op) { 418 case MAT_FORM_EXPLICIT_TRANSPOSE: 419 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 420 if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 421 A->form_explicit_transpose = flg; 422 break; 423 default: 424 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 425 break; 426 } 427 PetscFunctionReturn(0); 428 } 429 430 /* Depending on reuse, either build a new mat, or use the existing mat */ 431 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 432 { 433 PetscErrorCode ierr; 434 Mat_SeqAIJ *aseq; 435 436 PetscFunctionBegin; 437 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 438 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 439 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 440 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 441 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 442 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 443 if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 444 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 445 ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 446 ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 447 ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 448 aseq = static_cast<Mat_SeqAIJ*>(A->data); 449 if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 450 if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 451 A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 452 } 453 } 454 PetscFunctionReturn(0); 455 } 456 457 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 458 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 459 */ 460 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 461 { 462 PetscErrorCode ierr; 463 Mat_SeqAIJ *bseq; 464 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 465 Mat mat; 466 467 PetscFunctionBegin; 468 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 469 ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 470 mat = *B; 471 if (A->assembled) { 472 bseq = static_cast<Mat_SeqAIJ*>(mat->data); 473 bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 474 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 475 /* Now copy values to B if needed */ 476 if (dupOption == MAT_COPY_VALUES) { 477 if (akok->a_dual.need_sync_device()) { 478 Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 479 bkok->a_dual.modify_host(); 480 } else { /* If device has the latest data, we only copy data on device */ 481 Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 482 bkok->a_dual.modify_device(); 483 } 484 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 485 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 486 bkok->a_dual.modify_host(); 487 } 488 mat->spptr = bkok; 489 } 490 491 ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 492 ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 493 ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 494 ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 499 { 500 PetscErrorCode ierr; 501 Mat At; 502 KokkosCsrMatrix *internT,*csrmatT; 503 Mat_SeqAIJKokkos *atkok,*bkok; 504 505 PetscFunctionBegin; 506 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 507 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 508 CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 509 CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 510 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 511 if (reuse == MAT_INITIAL_MATRIX) *B = At; 512 else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 513 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 514 if ((*B)->assembled) { 515 bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 516 CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 517 ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 518 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 519 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 520 MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 521 MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 522 CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 523 CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 524 } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 525 } 526 PetscFunctionReturn(0); 527 } 528 529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 530 { 531 PetscErrorCode ierr; 532 Mat_SeqAIJKokkos *aijkok; 533 534 PetscFunctionBegin; 535 if (A->factortype == MAT_FACTOR_NONE) { 536 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 537 if (aijkok) { 538 if (aijkok->device_mat_d.data()) { 539 delete aijkok->colmap_d; 540 delete aijkok->i_uncompressed_d; 541 } 542 if (aijkok->diag_d) delete aijkok->diag_d; 543 } 544 delete aijkok; 545 } else { 546 delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 547 } 548 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 549 A->spptr = NULL; 550 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 551 PetscFunctionReturn(0); 552 } 553 554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 555 { 556 PetscErrorCode ierr; 557 558 PetscFunctionBegin; 559 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 560 ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 561 ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 566 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 567 { 568 PetscErrorCode ierr; 569 Mat_SeqAIJ *a,*b; 570 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 571 MatScalarKokkosView aa,ba,ca; 572 MatRowMapKokkosView ai,bi,ci; 573 MatColIdxKokkosView aj,bj,cj; 574 PetscInt m,n,nnz,aN; 575 576 PetscFunctionBegin; 577 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 578 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 579 PetscValidPointer(C,4); 580 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 581 PetscCheckTypeName(B,MATSEQAIJKOKKOS); 582 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 583 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 584 585 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 586 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 587 a = static_cast<Mat_SeqAIJ*>(A->data); 588 b = static_cast<Mat_SeqAIJ*>(B->data); 589 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 590 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 591 aa = akok->a_dual.view_device(); 592 ai = akok->i_dual.view_device(); 593 ba = bkok->a_dual.view_device(); 594 bi = bkok->i_dual.view_device(); 595 m = A->rmap->n; /* M, N and nnz of C */ 596 n = A->cmap->n + B->cmap->n; 597 nnz = a->nz + b->nz; 598 aN = A->cmap->n; /* N of A */ 599 if (reuse == MAT_INITIAL_MATRIX) { 600 aj = akok->j_dual.view_device(); 601 bj = bkok->j_dual.view_device(); 602 auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 603 auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 604 auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 605 ca = ca_dual.view_device(); 606 ci = ci_dual.view_device(); 607 cj = cj_dual.view_device(); 608 609 /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 610 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 611 PetscInt i = t.league_rank(); /* row i */ 612 PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 613 614 Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 615 ci(i) = coffset; 616 if (i == m-1) ci(m) = ai(m) + bi(m); 617 }); 618 619 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 620 if (k < alen) { 621 ca(coffset+k) = aa(ai(i)+k); 622 cj(coffset+k) = aj(ai(i)+k); 623 } else { 624 ca(coffset+k) = ba(bi(i)+k-alen); 625 cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 626 } 627 }); 628 }); 629 ca_dual.modify_device(); 630 ci_dual.modify_device(); 631 cj_dual.modify_device(); 632 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 633 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 634 } else if (reuse == MAT_REUSE_MATRIX) { 635 PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 636 PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 637 ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 638 ca = ckok->a_dual.view_device(); 639 ci = ckok->i_dual.view_device(); 640 641 Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 642 PetscInt i = t.league_rank(); /* row i */ 643 PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 644 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 645 if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 646 else ca(ci(i)+k) = ba(bi(i)+k-alen); 647 }); 648 }); 649 ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 650 } 651 PetscFunctionReturn(0); 652 } 653 654 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 655 { 656 PetscFunctionBegin; 657 delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 658 PetscFunctionReturn(0); 659 } 660 661 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 662 { 663 PetscErrorCode ierr; 664 Mat_Product *product = C->product; 665 Mat A,B; 666 bool transA,transB; /* use bool, since KK needs this type */ 667 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 668 Mat_SeqAIJ *c; 669 MatProductData_SeqAIJKokkos *pdata; 670 KokkosCsrMatrix *csrmatA,*csrmatB; 671 672 PetscFunctionBegin; 673 MatCheckProduct(C,1); 674 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 675 pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 676 677 if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 678 pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 679 PetscFunctionReturn(0); 680 } 681 682 switch (product->type) { 683 case MATPRODUCT_AB: transA = false; transB = false; break; 684 case MATPRODUCT_AtB: transA = true; transB = false; break; 685 case MATPRODUCT_ABt: transA = false; transB = true; break; 686 default: 687 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 688 } 689 690 A = product->A; 691 B = product->B; 692 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 693 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 694 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 695 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 696 ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 697 698 if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 699 700 csrmatA = &akok->csrmat; 701 csrmatB = &bkok->csrmat; 702 703 /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 704 if (transA) { 705 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 706 transA = false; 707 } 708 709 if (transB) { 710 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 711 transB = false; 712 } 713 714 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 715 CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 716 ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 717 /* shorter version of MatAssemblyEnd_SeqAIJ */ 718 c = (Mat_SeqAIJ*)C->data; 719 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 720 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 721 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 722 c->reallocs = 0; 723 C->info.mallocs = 0; 724 C->info.nz_unneeded = 0; 725 C->assembled = C->was_assembled = PETSC_TRUE; 726 C->num_ass++; 727 PetscFunctionReturn(0); 728 } 729 730 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 731 { 732 PetscErrorCode ierr; 733 Mat_Product *product = C->product; 734 MatProductType ptype; 735 Mat A,B; 736 bool transA,transB; 737 Mat_SeqAIJKokkos *akok,*bkok,*ckok; 738 MatProductData_SeqAIJKokkos *pdata; 739 MPI_Comm comm; 740 KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 741 742 PetscFunctionBegin; 743 MatCheckProduct(C,1); 744 ierr = PetscObjectGetComm((PetscObject)C,&comm); 745 if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 746 A = product->A; 747 B = product->B; 748 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 749 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 750 akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 751 bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 752 csrmatA = &akok->csrmat; 753 csrmatB = &bkok->csrmat; 754 755 ptype = product->type; 756 switch (ptype) { 757 case MATPRODUCT_AB: transA = false; transB = false; break; 758 case MATPRODUCT_AtB: transA = true; transB = false; break; 759 case MATPRODUCT_ABt: transA = false; transB = true; break; 760 default: 761 SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 762 } 763 764 product->data = pdata = new MatProductData_SeqAIJKokkos(); 765 pdata->kh.set_team_work_size(16); 766 pdata->kh.set_dynamic_scheduling(true); 767 pdata->reusesym = product->api_user; 768 769 /* TODO: add command line options to select spgemm algorithms */ 770 auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 771 #if defined(PETSC_HAVE_CUDA) 772 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 773 /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 774 spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 775 #endif 776 #endif 777 pdata->kh.create_spgemm_handle(spgemm_alg); 778 779 /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 780 if (transA) { 781 ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 782 transA = false; 783 } 784 785 if (transB) { 786 ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 787 transB = false; 788 } 789 790 CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 791 /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 792 So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 793 calling new Mat_SeqAIJKokkos(). 794 TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 795 */ 796 CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 797 CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 798 799 CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 800 ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 801 C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 802 PetscFunctionReturn(0); 803 } 804 805 /* handles sparse matrix matrix ops */ 806 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 807 { 808 PetscErrorCode ierr; 809 Mat_Product *product = mat->product; 810 PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 811 812 PetscFunctionBegin; 813 MatCheckProduct(mat,1); 814 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 815 if (product->type == MATPRODUCT_ABC) { 816 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 817 } 818 if (Biskok && Ciskok) { 819 switch (product->type) { 820 case MATPRODUCT_AB: 821 case MATPRODUCT_AtB: 822 case MATPRODUCT_ABt: 823 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 824 break; 825 case MATPRODUCT_PtAP: 826 case MATPRODUCT_RARt: 827 case MATPRODUCT_ABC: 828 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 829 break; 830 default: 831 break; 832 } 833 } else { /* fallback for AIJ */ 834 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 835 } 836 PetscFunctionReturn(0); 837 } 838 839 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 840 { 841 PetscErrorCode ierr; 842 Mat_SeqAIJKokkos *aijkok; 843 844 PetscFunctionBegin; 845 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 846 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 847 KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 848 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 849 ierr = WaitForKokkos();CHKERRQ(ierr); 850 ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 851 PetscFunctionReturn(0); 852 } 853 854 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 855 { 856 PetscErrorCode ierr; 857 Mat_SeqAIJKokkos *aijkok; 858 859 PetscFunctionBegin; 860 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 861 if (aijkok) { /* Only zero the device if data is already there */ 862 KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 863 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 864 } else { /* Might be preallocated but not assembled */ 865 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 866 } 867 PetscFunctionReturn(0); 868 } 869 870 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 871 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 872 { 873 PetscErrorCode ierr; 874 Mat_SeqAIJKokkos *aijkok; 875 876 PetscFunctionBegin; 877 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 878 PetscValidPointer(kv,2); 879 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 880 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 881 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 882 *kv = aijkok->a_dual.view_device(); 883 PetscFunctionReturn(0); 884 } 885 886 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 887 { 888 PetscFunctionBegin; 889 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890 PetscValidPointer(kv,2); 891 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 896 { 897 PetscErrorCode ierr; 898 Mat_SeqAIJKokkos *aijkok; 899 900 PetscFunctionBegin; 901 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902 PetscValidPointer(kv,2); 903 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 905 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 906 *kv = aijkok->a_dual.view_device(); 907 PetscFunctionReturn(0); 908 } 909 910 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 911 { 912 PetscErrorCode ierr; 913 914 PetscFunctionBegin; 915 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 916 PetscValidPointer(kv,2); 917 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 918 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 923 { 924 Mat_SeqAIJKokkos *aijkok; 925 926 PetscFunctionBegin; 927 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928 PetscValidPointer(kv,2); 929 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930 aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 931 *kv = aijkok->a_dual.view_device(); 932 PetscFunctionReturn(0); 933 } 934 935 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 936 { 937 PetscErrorCode ierr; 938 939 PetscFunctionBegin; 940 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 941 PetscValidPointer(kv,2); 942 PetscCheckTypeName(A,MATSEQAIJKOKKOS); 943 ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 944 PetscFunctionReturn(0); 945 } 946 947 /* Computes Y += alpha X */ 948 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 949 { 950 PetscErrorCode ierr; 951 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 952 Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 953 ConstMatScalarKokkosView Xa; 954 MatScalarKokkosView Ya; 955 956 PetscFunctionBegin; 957 PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 958 PetscCheckTypeName(X,MATSEQAIJKOKKOS); 959 ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 960 ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 961 962 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 963 /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 964 PetscBool e; 965 ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 966 if (e) { 967 ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 968 if (e) pattern = SAME_NONZERO_PATTERN; 969 } 970 } 971 972 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 973 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 974 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 975 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 976 */ 977 ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 978 xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 979 Xa = xkok->a_dual.view_device(); 980 Ya = ykok->a_dual.view_device(); 981 982 if (pattern == SAME_NONZERO_PATTERN) { 983 KokkosBlas::axpy(alpha,Xa,Ya); 984 ierr = MatSeqAIJKokkosModifyDevice(Y); 985 } else if (pattern == SUBSET_NONZERO_PATTERN) { 986 MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 987 MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 988 989 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 990 PetscInt i = t.league_rank(); /* row i */ 991 Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 992 PetscInt p,q = Yi(i); 993 for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 994 while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 995 if (Xj(p) == Yj(q)) { /* Found it */ 996 Ya(q) += alpha * Xa(p); 997 q++; 998 } else { 999 /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1000 Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1001 */ 1002 if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1003 } 1004 } 1005 }); 1006 }); 1007 ierr = MatSeqAIJKokkosModifyDevice(Y); 1008 } else { /* different nonzero patterns */ 1009 Mat Z; 1010 KokkosCsrMatrix zcsr; 1011 KernelHandle kh; 1012 kh.create_spadd_handle(false); 1013 KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1014 KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1015 zkok = new Mat_SeqAIJKokkos(zcsr); 1016 ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1017 ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1018 kh.destroy_spadd_handle(); 1019 } 1020 1021 PetscFunctionReturn(0); 1022 } 1023 1024 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1025 { 1026 PetscErrorCode ierr; 1027 1028 PetscFunctionBegin; 1029 ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 1030 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1031 B->offloadmask = PETSC_OFFLOAD_CPU; 1032 PetscFunctionReturn(0); 1033 } 1034 1035 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1036 { 1037 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1038 1039 PetscFunctionBegin; 1040 A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 1041 A->boundtocpu = PETSC_FALSE; 1042 1043 A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 1044 A->ops->destroy = MatDestroy_SeqAIJKokkos; 1045 A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1046 A->ops->axpy = MatAXPY_SeqAIJKokkos; 1047 A->ops->scale = MatScale_SeqAIJKokkos; 1048 A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1049 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 1050 A->ops->mult = MatMult_SeqAIJKokkos; 1051 A->ops->multadd = MatMultAdd_SeqAIJKokkos; 1052 A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 1053 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 1054 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 1055 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1056 A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1057 A->ops->transpose = MatTranspose_SeqAIJKokkos; 1058 A->ops->setoption = MatSetOption_SeqAIJKokkos; 1059 a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1060 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1061 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1062 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1063 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1064 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1065 PetscFunctionReturn(0); 1066 } 1067 1068 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1069 { 1070 PetscErrorCode ierr; 1071 Mat_SeqAIJ *aseq; 1072 PetscInt i,m,n; 1073 1074 PetscFunctionBegin; 1075 if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1076 1077 m = akok->nrows(); 1078 n = akok->ncols(); 1079 ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1080 ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1081 1082 /* Set up data structures of A as a MATSEQAIJ */ 1083 ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1084 aseq = (Mat_SeqAIJ*)(A)->data; 1085 1086 akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1087 akok->j_dual.sync_host(); 1088 1089 aseq->i = akok->i_host_data(); 1090 aseq->j = akok->j_host_data(); 1091 aseq->a = akok->a_host_data(); 1092 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1093 aseq->singlemalloc = PETSC_FALSE; 1094 aseq->free_a = PETSC_FALSE; 1095 aseq->free_ij = PETSC_FALSE; 1096 aseq->nz = akok->nnz(); 1097 aseq->maxnz = aseq->nz; 1098 1099 ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1100 ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1101 for (i=0; i<m; i++) { 1102 aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1103 } 1104 1105 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1106 akok->nonzerostate = A->nonzerostate; 1107 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1108 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1109 A->spptr = akok; 1110 PetscFunctionReturn(0); 1111 } 1112 1113 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1114 1115 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1116 */ 1117 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1118 { 1119 PetscErrorCode ierr; 1120 1121 PetscFunctionBegin; 1122 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1123 ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 1124 PetscFunctionReturn(0); 1125 } 1126 1127 /* --------------------------------------------------------------------------------*/ 1128 /*@C 1129 MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1130 (the default parallel PETSc format). This matrix will ultimately be handled by 1131 Kokkos for calculations. For good matrix 1132 assembly performance the user should preallocate the matrix storage by setting 1133 the parameter nz (or the array nnz). By setting these parameters accurately, 1134 performance during matrix assembly can be increased by more than a factor of 50. 1135 1136 Collective 1137 1138 Input Parameters: 1139 + comm - MPI communicator, set to PETSC_COMM_SELF 1140 . m - number of rows 1141 . n - number of columns 1142 . nz - number of nonzeros per row (same for all rows) 1143 - nnz - array containing the number of nonzeros in the various rows 1144 (possibly different for each row) or NULL 1145 1146 Output Parameter: 1147 . A - the matrix 1148 1149 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1150 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1151 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1152 1153 Notes: 1154 If nnz is given then nz is ignored 1155 1156 The AIJ format (also called the Yale sparse matrix format or 1157 compressed row storage), is fully compatible with standard Fortran 77 1158 storage. That is, the stored row and column indices can begin at 1159 either one (as in Fortran) or zero. See the users' manual for details. 1160 1161 Specify the preallocated storage with either nz or nnz (not both). 1162 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1163 allocation. For large problems you MUST preallocate memory or you 1164 will get TERRIBLE performance, see the users' manual chapter on matrices. 1165 1166 By default, this format uses inodes (identical nodes) when possible, to 1167 improve numerical efficiency of matrix-vector products and solves. We 1168 search for consecutive rows with the same nonzero structure, thereby 1169 reusing matrix information to achieve increased efficiency. 1170 1171 Level: intermediate 1172 1173 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 1174 @*/ 1175 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1176 { 1177 PetscErrorCode ierr; 1178 1179 PetscFunctionBegin; 1180 ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1181 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1182 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1183 ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1184 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1185 PetscFunctionReturn(0); 1186 } 1187 1188 typedef Kokkos::TeamPolicy<>::member_type team_member; 1189 // 1190 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 1191 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 1192 // 1193 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1194 { 1195 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1196 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1197 IS isrow = b->row,isicol = b->icol; 1198 PetscErrorCode ierr; 1199 const PetscInt *r_h,*ic_h; 1200 const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1201 const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1202 PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 1203 PetscBool row_identity,col_identity; 1204 PetscInt nc, Nf, nVec=32; // should be a parameter 1205 PetscContainer container; 1206 1207 PetscFunctionBegin; 1208 if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 1209 ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 1210 if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 1211 ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 1212 if (container) { 1213 PetscInt *pNf=NULL, nv; 1214 ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 1215 Nf = (*pNf)%1000; 1216 nv = (*pNf)/1000; 1217 if (nv>0) nVec = nv; 1218 } else Nf = 1; 1219 if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 1220 ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 1221 ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 1222 ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 1223 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 #endif 1226 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1227 { 1228 #define KOKKOS_SHARED_LEVEL 1 1229 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 1230 using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 1231 using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 1232 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 1233 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 1234 const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 1235 Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 1236 size_t flops_h = 0.0; 1237 Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 1238 Kokkos::View<size_t> d_flops_k ("flops"); 1239 const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1240 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 1241 Kokkos::deep_copy (d_flops_k, h_flops_k); 1242 Kokkos::deep_copy (d_r_k, h_r_k); 1243 Kokkos::deep_copy (d_ic_k, h_ic_k); 1244 // Fill A --> fact 1245 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1246 const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 1247 const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 1248 const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 1249 // zero rows of B 1250 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1251 PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 1252 PetscScalar *baL = ba_d + bi_d[rowb]; 1253 PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 1254 /* zero (unfactored row) */ 1255 for (int j=0;j<nzbL;j++) baL[j] = 0; 1256 for (int j=0;j<nzbU;j++) baU[j] = 0; 1257 }); 1258 // copy A into B 1259 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 1260 PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 1261 const PetscScalar *av = aa_d + ai_d[rowa]; 1262 const PetscInt *ajtmp = aj_d + ai_d[rowa]; 1263 /* load in initial (unfactored row) */ 1264 for (int j=0;j<nza;j++) { 1265 PetscInt colb = ic[ajtmp[j]]; 1266 PetscScalar vala = av[j]; 1267 if (colb == rowb) { 1268 *(ba_d + bdiag_d[rowb]) = vala; 1269 } else { 1270 const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1271 PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 1272 PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 1273 for (int j=0; j<nz ; j++) { 1274 if (pbj[j] == colb) { 1275 pba[j] = vala; 1276 set++; 1277 break; 1278 } 1279 } 1280 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1281 } 1282 } 1283 }); 1284 }); 1285 Kokkos::fence(); 1286 1287 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 1288 sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1289 scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 1290 sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1291 const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 1292 const PetscInt start = field*nloc, end = start + nloc; 1293 Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 1294 // A22 panel update for each row A(1,:) and col A(:,1) 1295 for (int ii=start; ii<end-1; ii++) { 1296 const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 1297 const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 1298 const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 1299 const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 1300 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 1301 PetscInt kIdx = j*Ni + field_block_idx; 1302 if (kIdx >= nzUi) /* void */ ; 1303 else { 1304 const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 1305 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 1306 const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 1307 size_t st_idx; 1308 // find and do L(k,i) = A(:k,i) / A(i,i) 1309 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 1310 // get column, there has got to be a better way 1311 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 1312 if (pjL[j] == ii) { 1313 PetscScalar *pLki = ba_d + bi_d[myk] + j; 1314 idx = j; // output 1315 *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 1316 } 1317 }, st_idx); 1318 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1319 #if defined(PETSC_USE_DEBUG) 1320 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 1321 #endif 1322 // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 1323 // U(i+1,:end) 1324 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 1325 PetscScalar Uij = baUi[uiIdx]; 1326 PetscInt col = bjUi[uiIdx]; 1327 if (col==myk) { 1328 // A_kk = A_kk - L_ki * U_ij(k) 1329 PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 1330 *Akkv = *Akkv - L_ki() * Uij; // UiK 1331 } else { 1332 PetscScalar *start, *end, *pAkjv=NULL; 1333 PetscInt high, low; 1334 const PetscInt *startj; 1335 if (col<myk) { // L 1336 PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 1337 PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 1338 start = pLki+1; // start at pLki+1, A22(myk,1) 1339 startj= bj_d + bi_d[myk] + idx; 1340 end = ba_d + bi_d[myk+1]; 1341 } else { 1342 PetscInt idx = bdiag_d[myk+1]+1; 1343 start = ba_d + idx; 1344 startj= bj_d + idx; 1345 end = ba_d + bdiag_d[myk]; 1346 } 1347 // search for 'col', use bisection search - TODO 1348 low = 0; 1349 high = (PetscInt)(end-start); 1350 while (high-low > 5) { 1351 int t = (low+high)/2; 1352 if (startj[t] > col) high = t; 1353 else low = t; 1354 } 1355 for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 1356 if (startj[pAkjv-start] == col) break; 1357 } 1358 #if defined(PETSC_USE_DEBUG) 1359 if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 1360 #endif 1361 *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 1362 } 1363 }); 1364 } 1365 }); 1366 team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 1367 if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 1368 } /* endof for (i=0; i<n; i++) { */ 1369 Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 1370 }); 1371 Kokkos::fence(); 1372 Kokkos::deep_copy (h_flops_k, d_flops_k); 1373 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1374 ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1375 #elif defined(PETSC_USE_LOG) 1376 ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 1377 #endif 1378 Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 1379 const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 1380 const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 1381 /* Invert diagonal for simpler triangular solves */ 1382 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 1383 int i = start + outer_index*Ni + lg_rank%Ni; 1384 if (i < end) { 1385 PetscScalar *pv = ba_d + bdiag_d[i]; 1386 *pv = 1.0/(*pv); 1387 } 1388 }); 1389 }); 1390 } 1391 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 1392 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1393 #endif 1394 ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 1395 ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 1396 1397 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 1398 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 1399 if (b->inode.size) { 1400 B->ops->solve = MatSolve_SeqAIJ_Inode; 1401 } else if (row_identity && col_identity) { 1402 B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 1403 } else { 1404 B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 1405 } 1406 B->offloadmask = PETSC_OFFLOAD_GPU; 1407 ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 1408 B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 1409 B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1410 B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1411 B->ops->matsolve = MatMatSolve_SeqAIJ; 1412 B->assembled = PETSC_TRUE; 1413 B->preallocated = PETSC_TRUE; 1414 1415 PetscFunctionReturn(0); 1416 } 1417 1418 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1419 { 1420 PetscErrorCode ierr; 1421 1422 PetscFunctionBegin; 1423 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1424 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 1425 PetscFunctionReturn(0); 1426 } 1427 1428 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1429 { 1430 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1431 1432 PetscFunctionBegin; 1433 if (!factors->sptrsv_symbolic_completed) { 1434 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 1435 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 1436 factors->sptrsv_symbolic_completed = PETSC_TRUE; 1437 } 1438 PetscFunctionReturn(0); 1439 } 1440 1441 /* Check if we need to update factors etc for transpose solve */ 1442 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1443 { 1444 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1445 MatColIdxType n = A->rmap->n; 1446 1447 PetscFunctionBegin; 1448 if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 1449 /* Update L^T and do sptrsv symbolic */ 1450 factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 1451 Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1452 factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1453 factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 1454 1455 KokkosKernels::Impl::transpose_matrix< 1456 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1457 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1458 MatRowMapKokkosView,DefaultExecutionSpace>( 1459 n,n,factors->iL_d,factors->jL_d,factors->aL_d, 1460 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1461 1462 /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 1463 We have to sort the indices, until KK provides finer control options. 1464 */ 1465 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1466 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1467 factors->iLt_d,factors->jLt_d,factors->aLt_d); 1468 1469 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 1470 1471 /* Update U^T and do sptrsv symbolic */ 1472 factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 1473 Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1474 factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1475 factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 1476 1477 KokkosKernels::Impl::transpose_matrix< 1478 ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1479 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1480 MatRowMapKokkosView,DefaultExecutionSpace>( 1481 n,n,factors->iU_d, factors->jU_d, factors->aU_d, 1482 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1483 1484 /* Sort indices. See comments above */ 1485 KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1486 MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 1487 factors->iUt_d,factors->jUt_d,factors->aUt_d); 1488 1489 KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 1490 factors->transpose_updated = PETSC_TRUE; 1491 } 1492 PetscFunctionReturn(0); 1493 } 1494 1495 /* Solve Ax = b, with A = LU */ 1496 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 1497 { 1498 PetscErrorCode ierr; 1499 ConstPetscScalarKokkosView bv; 1500 PetscScalarKokkosView xv; 1501 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1502 1503 PetscFunctionBegin; 1504 ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 1505 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1506 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1507 /* Solve L tmpv = b */ 1508 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 1509 /* Solve Ux = tmpv */ 1510 CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 1511 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1512 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1513 PetscFunctionReturn(0); 1514 } 1515 1516 /* Solve A^T x = b, where A^T = U^T L^T */ 1517 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 1518 { 1519 PetscErrorCode ierr; 1520 ConstPetscScalarKokkosView bv; 1521 PetscScalarKokkosView xv; 1522 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1523 1524 PetscFunctionBegin; 1525 ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 1526 ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 1527 ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 1528 /* Solve U^T tmpv = b */ 1529 KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 1530 1531 /* Solve L^T x = tmpv */ 1532 KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 1533 ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 1534 ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 1539 { 1540 PetscErrorCode ierr; 1541 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1542 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1543 PetscInt fill_lev = info->levels; 1544 1545 PetscFunctionBegin; 1546 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1547 1548 auto a_d = aijkok->a_dual.view_device(); 1549 auto i_d = aijkok->i_dual.view_device(); 1550 auto j_d = aijkok->j_dual.view_device(); 1551 1552 KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 1553 1554 B->assembled = PETSC_TRUE; 1555 B->preallocated = PETSC_TRUE; 1556 B->ops->solve = MatSolve_SeqAIJKokkos; 1557 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 1558 B->ops->matsolve = NULL; 1559 B->ops->matsolvetranspose = NULL; 1560 B->offloadmask = PETSC_OFFLOAD_GPU; 1561 1562 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 1563 factors->transpose_updated = PETSC_FALSE; 1564 factors->sptrsv_symbolic_completed = PETSC_FALSE; 1565 PetscFunctionReturn(0); 1566 } 1567 1568 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1569 { 1570 PetscErrorCode ierr; 1571 Mat_SeqAIJKokkos *aijkok; 1572 Mat_SeqAIJ *b; 1573 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 1574 PetscInt fill_lev = info->levels; 1575 PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 1576 PetscInt n = A->rmap->n; 1577 1578 PetscFunctionBegin; 1579 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1580 /* Rebuild factors */ 1581 if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 1582 else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 1583 1584 /* Create a spiluk handle and then do symbolic factorization */ 1585 nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 1586 factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 1587 1588 auto spiluk_handle = factors->kh.get_spiluk_handle(); 1589 1590 Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 1591 Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 1592 Kokkos::realloc(factors->iU_d,n+1); 1593 Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 1594 1595 aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1596 auto i_d = aijkok->i_dual.view_device(); 1597 auto j_d = aijkok->j_dual.view_device(); 1598 KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 1599 /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 1600 1601 Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 1602 Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 1603 Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 1604 Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 1605 1606 /* TODO: add options to select sptrsv algorithms */ 1607 /* Create sptrsv handles for L, U and their transpose */ 1608 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 1609 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 1610 #else 1611 auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 1612 #endif 1613 1614 factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 1615 factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 1616 factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 1617 factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 1618 1619 /* Fill fields of the factor matrix B */ 1620 ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1621 b = (Mat_SeqAIJ*)B->data; 1622 b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 1623 B->info.fill_ratio_given = info->fill; 1624 B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 1625 1626 B->offloadmask = PETSC_OFFLOAD_GPU; 1627 B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1628 PetscFunctionReturn(0); 1629 } 1630 1631 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1632 { 1633 PetscErrorCode ierr; 1634 Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 1635 const PetscInt nrows = A->rmap->n; 1636 1637 PetscFunctionBegin; 1638 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1639 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 1640 // move B data into Kokkos 1641 ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 1642 ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 1643 { 1644 Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1645 if (!baijkok->diag_d) { 1646 const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1647 baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1648 Kokkos::deep_copy (*baijkok->diag_d, h_diag); 1649 } 1650 } 1651 PetscFunctionReturn(0); 1652 } 1653 1654 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1655 { 1656 PetscFunctionBegin; 1657 *type = MATSOLVERKOKKOS; 1658 PetscFunctionReturn(0); 1659 } 1660 1661 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 1662 { 1663 PetscFunctionBegin; 1664 *type = MATSOLVERKOKKOSDEVICE; 1665 PetscFunctionReturn(0); 1666 } 1667 1668 /*MC 1669 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 1670 on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1671 1672 Level: beginner 1673 1674 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1675 M*/ 1676 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1677 { 1678 PetscErrorCode ierr; 1679 PetscInt n = A->rmap->n; 1680 1681 PetscFunctionBegin; 1682 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1683 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1684 (*B)->factortype = ftype; 1685 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1686 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1687 1688 if (ftype == MAT_FACTOR_LU) { 1689 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1690 (*B)->canuseordering = PETSC_TRUE; 1691 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 1692 } else if (ftype == MAT_FACTOR_ILU) { 1693 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1694 (*B)->canuseordering = PETSC_FALSE; 1695 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 1696 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1697 1698 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1699 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 1704 { 1705 PetscErrorCode ierr; 1706 PetscInt n = A->rmap->n; 1707 1708 PetscFunctionBegin; 1709 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1710 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1711 (*B)->factortype = ftype; 1712 (*B)->canuseordering = PETSC_TRUE; 1713 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1714 ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1715 1716 if (ftype == MAT_FACTOR_LU) { 1717 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1718 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 1719 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 1720 1721 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1722 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 1723 PetscFunctionReturn(0); 1724 } 1725 1726 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1727 { 1728 PetscErrorCode ierr; 1729 1730 PetscFunctionBegin; 1731 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1732 ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 1733 ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 1734 PetscFunctionReturn(0); 1735 } 1736 1737 /* Utility to print out a KokkosCsrMatrix for debugging */ 1738 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1739 { 1740 PetscErrorCode ierr; 1741 const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1742 const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1743 const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1744 const PetscInt *i = iv.data(); 1745 const PetscInt *j = jv.data(); 1746 const PetscScalar *a = av.data(); 1747 PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1748 1749 PetscFunctionBegin; 1750 ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1751 for (PetscInt k=0; k<m; k++) { 1752 ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1753 for (PetscInt p=i[k]; p<i[k+1]; p++) { 1754 ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1755 } 1756 ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1757 } 1758 PetscFunctionReturn(0); 1759 } 1760