1 #include <petscvec_kokkos.hpp> 2 #include <petsc/private/deviceimpl.h> 3 #include <petsc/private/pcimpl.h> 4 #include <petsc/private/kspimpl.h> 5 #include <petscksp.h> /*I "petscksp.h" I*/ 6 #include "petscsection.h" 7 #include <petscdmcomposite.h> 8 #include "Kokkos_Core.hpp" 9 10 #include <../src/mat/impls/aij/seq/aij.h> 11 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 12 13 #if defined(PETSC_HAVE_CUDA) 14 #include <nvToolsExt.h> 15 #endif 16 17 #include <petscdevice_cupm.h> 18 19 #define PCBJKOKKOS_SHARED_LEVEL 0 // 0 is shared, 1 is global 20 #define PCBJKOKKOS_VEC_SIZE 16 21 #define PCBJKOKKOS_TEAM_SIZE 16 22 23 #define PCBJKOKKOS_VERBOSE_LEVEL 2 24 25 typedef Kokkos::DefaultExecutionSpace exec_space; 26 using layout = Kokkos::LayoutRight; 27 using IntView = Kokkos::View<PetscInt **, layout, exec_space>; 28 using AMatrixValueView = const Kokkos::View<PetscScalar **, layout, exec_space>; 29 using XYType = const Kokkos::View<PetscScalar **, layout, exec_space>; 30 31 typedef enum { 32 BATCH_KSP_BICG_IDX, 33 BATCH_KSP_TFQMR_IDX, 34 BATCH_KSP_GMRES_IDX, 35 NUM_BATCH_TYPES 36 } KSPIndex; 37 typedef struct { 38 Vec vec_diag; 39 PetscInt nBlocks; /* total number of blocks */ 40 PetscInt n; // cache host version of d_bid_eqOffset_k[nBlocks] 41 KSP ksp; // Used just for options. Should have one for each block 42 Kokkos::View<PetscInt *, Kokkos::LayoutRight> *d_bid_eqOffset_k; 43 Kokkos::View<PetscScalar *, Kokkos::LayoutRight> *d_idiag_k; 44 Kokkos::View<PetscInt *> *d_isrow_k; 45 Kokkos::View<PetscInt *> *d_isicol_k; 46 KSPIndex ksp_type_idx; 47 PetscInt nwork; 48 PetscInt const_block_size; // used to decide to use shared memory for work vectors 49 PetscInt *dm_Nf; // Number of fields in each DM 50 PetscInt num_dms; 51 // diagnostics 52 PetscBool reason; 53 PetscBool monitor; 54 PetscInt batch_target; 55 PetscInt nsolves_team; 56 PetscInt max_nits; 57 // caches 58 IntView *rowOffsets; 59 IntView *colIndices; 60 XYType *batch_b; 61 XYType *batch_x; 62 AMatrixValueView *batch_values; 63 } PC_PCBJKOKKOS; 64 65 #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES) 66 #include <fstream> 67 68 #include "Kokkos_Timer.hpp" 69 #include "Kokkos_Random.hpp" 70 #include "Kokkos_UnorderedMap.hpp" 71 #include "Kokkos_Sort.hpp" 72 73 /// KokkosKernels headers 74 #include "KokkosBatched_Util.hpp" 75 #include "KokkosBatched_Vector.hpp" 76 77 #include <Kokkos_ArithTraits.hpp> 78 #include <KokkosBatched_Util.hpp> 79 #include <KokkosBatched_Vector.hpp> 80 #include <KokkosBatched_Copy_Decl.hpp> 81 #include <KokkosBatched_Copy_Impl.hpp> 82 #include <KokkosBatched_AddRadial_Decl.hpp> 83 #include <KokkosBatched_AddRadial_Impl.hpp> 84 #include <KokkosBatched_Gemm_Decl.hpp> 85 #include <KokkosBatched_Gemm_Serial_Impl.hpp> 86 #include <KokkosBatched_Gemm_Team_Impl.hpp> 87 #include <KokkosBatched_Gemv_Decl.hpp> 88 #include <KokkosBatched_Gemv_Serial_Impl.hpp> 89 #include <KokkosBatched_Gemv_Team_Impl.hpp> 90 #include <KokkosBatched_Trsm_Decl.hpp> 91 #include <KokkosBatched_Trsm_Serial_Impl.hpp> 92 #include <KokkosBatched_Trsm_Team_Impl.hpp> 93 #include <KokkosBatched_Trsv_Decl.hpp> 94 #include <KokkosBatched_Trsv_Serial_Impl.hpp> 95 #include <KokkosBatched_Trsv_Team_Impl.hpp> 96 #include <KokkosBatched_LU_Decl.hpp> 97 #include <KokkosBatched_LU_Serial_Impl.hpp> 98 #include <KokkosBatched_LU_Team_Impl.hpp> 99 #include <KokkosSparse_CrsMatrix.hpp> 100 #include "KokkosBatched_Spmv.hpp" 101 #include "KokkosBatched_CrsMatrix.hpp" 102 #include "KokkosBatched_Krylov_Handle.hpp" 103 #include "KokkosBatched_GMRES.hpp" 104 #include "KokkosBatched_JacobiPrec.hpp" 105 106 template <typename DeviceType, typename ValuesViewType, typename IntView, typename VectorViewType, typename KrylovHandleType> 107 struct Functor_TestBatchedTeamVectorGMRES { 108 const ValuesViewType _D; 109 const ValuesViewType _diag; 110 const IntView _r; 111 const IntView _c; 112 const VectorViewType _X; 113 const VectorViewType _B; 114 const int _N_team, _team_size, _vector_length; 115 const int _N_iteration; 116 const double _tol; 117 const int _ortho_strategy; 118 const int _scratch_pad_level; 119 KrylovHandleType _handle; 120 121 KOKKOS_INLINE_FUNCTION 122 Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, const int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) : 123 _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle) { } 124 125 KOKKOS_INLINE_FUNCTION 126 Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) : 127 _D(D), _diag(diag), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle) { } 128 129 template <typename MemberType> 130 KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { 131 const int first_matrix = static_cast<int>(member.league_rank()) * _N_team; 132 const int N = _D.extent(0); 133 const int last_matrix = (static_cast<int>(member.league_rank() + 1) * _N_team < N ? static_cast<int>(member.league_rank() + 1) * _N_team : N); 134 const int graphID = static_cast<int>(member.league_rank()); 135 using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy<MemberType, KokkosBatched::Trans::NoTranspose, 1>; 136 137 auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 138 auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 139 auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 140 using ScratchPadIntViewType = Kokkos::View<typename IntView::non_const_value_type *, typename IntView::array_layout, typename IntView::execution_space::scratch_memory_space>; 141 using ScratchPadValuesViewType = Kokkos::View<typename ValuesViewType::non_const_value_type **, typename ValuesViewType::array_layout, typename ValuesViewType::execution_space::scratch_memory_space>; 142 143 using Operator = KokkosBatched::CrsMatrix<ValuesViewType, ScratchPadIntViewType>; 144 ScratchPadIntViewType r(member.team_scratch(1), _r.extent(1)); 145 ScratchPadIntViewType c(member.team_scratch(1), _c.extent(1)); 146 147 TeamVectorCopy1D::invoke(member, Kokkos::subview(_r, graphID, Kokkos::ALL), r); 148 TeamVectorCopy1D::invoke(member, Kokkos::subview(_c, graphID, Kokkos::ALL), c); 149 Operator A(d, r, c); 150 151 ScratchPadValuesViewType diag(member.team_scratch(1), last_matrix - first_matrix, _diag.extent(1)); 152 using PrecOperator = KokkosBatched::JacobiPrec<ScratchPadValuesViewType>; 153 154 KokkosBatched::TeamVectorCopy<MemberType>::invoke(member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); 155 PrecOperator P(diag); 156 P.setComputedInverse(); 157 158 KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<Operator, VectorViewType, PrecOperator, KrylovHandleType>(member, A, b, x, P, _handle); 159 } 160 inline double run(PC pc) { 161 typedef typename ValuesViewType::value_type value_type; 162 std::string name("KokkosBatched::Test::TeamVectorGMRES"); 163 Kokkos::Timer timer; 164 Kokkos::Profiling::pushRegion(name.c_str()); 165 166 Kokkos::TeamPolicy<DeviceType> auto_policy(ceil(1. * _D.extent(0) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); 167 Kokkos::TeamPolicy<DeviceType> tuned_policy(ceil(1. * _D.extent(0) / _N_team), _team_size, _vector_length); 168 Kokkos::TeamPolicy<DeviceType> policy; 169 170 if (_team_size < 1) policy = auto_policy; 171 else policy = tuned_policy; 172 173 _handle.set_max_iteration(_N_iteration); 174 _handle.set_tolerance(_tol); 175 _handle.set_ortho_strategy(_ortho_strategy); 176 _handle.set_scratch_pad_level(_scratch_pad_level); 177 _handle.set_compute_last_residual(true); 178 179 int maximum_iteration = _handle.get_max_iteration(); 180 181 using ScalarType = typename ValuesViewType::non_const_value_type; 182 using Layout = typename ValuesViewType::array_layout; 183 using EXSP = typename ValuesViewType::execution_space; 184 185 using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type; 186 187 using ViewType1D = Kokkos::View<MagnitudeType *, Layout, EXSP>; 188 using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>; 189 using ViewType3D = Kokkos::View<ScalarType ***, Layout, EXSP>; 190 using IntViewType1D = Kokkos::View<PetscInt *, Layout, EXSP>; 191 192 size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); 193 size_t bytes_row_ptr = IntViewType1D::shmem_size(_r.extent(1)); 194 size_t bytes_col_idc = IntViewType1D::shmem_size(_c.extent(1)); 195 size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); 196 size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); 197 198 size_t bytes_diag = bytes_2D_1; 199 size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; 200 201 policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp)); 202 policy.set_scratch_size(1, Kokkos::PerTeam(bytes_col_idc + bytes_row_ptr + bytes_diag)); 203 PetscInfo(pc, "%d scratch memory(0) = %d + %d + %d bytes_diag=%d; %d scratch memory(1); %d maximum_iterations\n", (int)(bytes_tmp), 2 * (int)bytes_2D_1, 2 * (int)bytes_1D, (int)bytes_2D_2, (int)bytes_diag, (int)(bytes_row_ptr + bytes_col_idc + bytes_diag), (int)maximum_iteration); 204 exec_space().fence(); 205 timer.reset(); 206 Kokkos::parallel_for(name.c_str(), policy, *this); 207 exec_space().fence(); 208 double sec = timer.seconds(); 209 210 return sec; 211 } 212 }; 213 #endif // KK GMRES 214 215 typedef Kokkos::TeamPolicy<>::member_type team_member; 216 217 static PetscErrorCode PCBJKOKKOSCreateKSP_BJKOKKOS(PC pc) { 218 const char *prefix; 219 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 220 DM dm; 221 222 PetscFunctionBegin; 223 PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &jac->ksp)); 224 PetscCall(KSPSetErrorIfNotConverged(jac->ksp, pc->erroriffailure)); 225 PetscCall(PetscObjectIncrementTabLevel((PetscObject)jac->ksp, (PetscObject)pc, 1)); 226 PetscCall(PCGetOptionsPrefix(pc, &prefix)); 227 PetscCall(KSPSetOptionsPrefix(jac->ksp, prefix)); 228 PetscCall(KSPAppendOptionsPrefix(jac->ksp, "pc_bjkokkos_")); 229 PetscCall(PCGetDM(pc, &dm)); 230 if (dm) { 231 PetscCall(KSPSetDM(jac->ksp, dm)); 232 PetscCall(KSPSetDMActive(jac->ksp, PETSC_FALSE)); 233 } 234 jac->reason = PETSC_FALSE; 235 jac->monitor = PETSC_FALSE; 236 jac->batch_target = -1; 237 jac->nsolves_team = 1; 238 jac->ksp->max_it = 50; // this is realy for GMRES w/o restarts 239 PetscFunctionReturn(0); 240 } 241 242 // y <-- Ax 243 KOKKOS_INLINE_FUNCTION PetscErrorCode MatMult(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc) { 244 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 245 int rowa = ic[rowb]; 246 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 247 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 248 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 249 PetscScalar sum; 250 Kokkos::parallel_reduce( 251 Kokkos::ThreadVectorRange(team, n), [=](const int i, PetscScalar &lsum) { lsum += aa[i] * x_loc[r[aj[i]] - start]; }, sum); 252 Kokkos::single(Kokkos::PerThread(team), [=]() { y_loc[rowb - start] = sum; }); 253 }); 254 team.team_barrier(); 255 return 0; 256 } 257 258 // temp buffer per thread with reduction at end? 259 KOKKOS_INLINE_FUNCTION PetscErrorCode MatMultTranspose(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc) { 260 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, end - start), [=](int i) { y_loc[i] = 0; }); 261 team.team_barrier(); 262 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 263 int rowa = ic[rowb]; 264 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 265 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 266 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 267 const PetscScalar xx = x_loc[rowb - start]; // rowb = ic[rowa] = ic[r[rowb]] 268 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) { 269 PetscScalar val = aa[i] * xx; 270 Kokkos::atomic_fetch_add(&y_loc[r[aj[i]] - start], val); 271 }); 272 }); 273 team.team_barrier(); 274 return 0; 275 } 276 277 typedef struct Batch_MetaData_TAG { 278 PetscInt flops; 279 PetscInt its; 280 KSPConvergedReason reason; 281 } Batch_MetaData; 282 283 // Solve A(BB^-1)x = y with TFQMR. Right preconditioned to get un-preconditioned residual 284 KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_TFQMR(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor) { 285 using Kokkos::parallel_for; 286 using Kokkos::parallel_reduce; 287 int Nblk = end - start, i, m, stride = stride_shared, idx = 0; 288 PetscReal dp, dpold, w, dpest, tau, psi, cm, r0; 289 const PetscScalar *Diag = &glb_idiag[start]; 290 PetscScalar *ptr = work_space_shared, rho, rhoold, a, s, b, eta, etaold, psiold, cf, dpi; 291 292 if (idx++ == nShareVec) { 293 ptr = work_space_global; 294 stride = stride_global; 295 } 296 PetscScalar *XX = ptr; 297 ptr += stride; 298 if (idx++ == nShareVec) { 299 ptr = work_space_global; 300 stride = stride_global; 301 } 302 PetscScalar *R = ptr; 303 ptr += stride; 304 if (idx++ == nShareVec) { 305 ptr = work_space_global; 306 stride = stride_global; 307 } 308 PetscScalar *RP = ptr; 309 ptr += stride; 310 if (idx++ == nShareVec) { 311 ptr = work_space_global; 312 stride = stride_global; 313 } 314 PetscScalar *V = ptr; 315 ptr += stride; 316 if (idx++ == nShareVec) { 317 ptr = work_space_global; 318 stride = stride_global; 319 } 320 PetscScalar *T = ptr; 321 ptr += stride; 322 if (idx++ == nShareVec) { 323 ptr = work_space_global; 324 stride = stride_global; 325 } 326 PetscScalar *Q = ptr; 327 ptr += stride; 328 if (idx++ == nShareVec) { 329 ptr = work_space_global; 330 stride = stride_global; 331 } 332 PetscScalar *P = ptr; 333 ptr += stride; 334 if (idx++ == nShareVec) { 335 ptr = work_space_global; 336 stride = stride_global; 337 } 338 PetscScalar *U = ptr; 339 ptr += stride; 340 if (idx++ == nShareVec) { 341 ptr = work_space_global; 342 stride = stride_global; 343 } 344 PetscScalar *D = ptr; 345 ptr += stride; 346 if (idx++ == nShareVec) { 347 ptr = work_space_global; 348 stride = stride_global; 349 } 350 PetscScalar *AUQ = V; 351 352 // init: get b, zero x 353 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 354 int rowa = ic[rowb]; 355 R[rowb - start] = glb_b[rowa]; 356 XX[rowb - start] = 0; 357 }); 358 team.team_barrier(); 359 parallel_reduce( 360 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi); 361 team.team_barrier(); 362 r0 = dp = PetscSqrtReal(PetscRealPart(dpi)); 363 // diagnostics 364 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 365 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); }); 366 #endif 367 if (dp < atol) { 368 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 369 return 0; 370 } 371 if (0 == maxit) { 372 metad->reason = KSP_DIVERGED_ITS; 373 return 0; 374 } 375 376 /* Make the initial Rp = R */ 377 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { RP[idx] = R[idx]; }); 378 team.team_barrier(); 379 /* Set the initial conditions */ 380 etaold = 0.0; 381 psiold = 0.0; 382 tau = dp; 383 dpold = dp; 384 385 /* rhoold = (r,rp) */ 386 parallel_reduce( 387 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rhoold); 388 team.team_barrier(); 389 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 390 U[idx] = R[idx]; 391 P[idx] = R[idx]; 392 T[idx] = Diag[idx] * P[idx]; 393 D[idx] = 0; 394 }); 395 team.team_barrier(); 396 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V); 397 398 i = 0; 399 do { 400 /* s <- (v,rp) */ 401 parallel_reduce( 402 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += V[idx] * PetscConj(RP[idx]); }, s); 403 team.team_barrier(); 404 a = rhoold / s; /* a <- rho / s */ 405 /* q <- u - a v VecWAXPY(w,alpha,x,y): w = alpha x + y. */ 406 /* t <- u + q */ 407 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 408 Q[idx] = U[idx] - a * V[idx]; 409 T[idx] = U[idx] + Q[idx]; 410 }); 411 team.team_barrier(); 412 // KSP_PCApplyBAorAB 413 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * T[idx]; }); 414 team.team_barrier(); 415 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, AUQ); 416 /* r <- r - a K (u + q) */ 417 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { R[idx] = R[idx] - a * AUQ[idx]; }); 418 team.team_barrier(); 419 parallel_reduce( 420 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi); 421 team.team_barrier(); 422 dp = PetscSqrtReal(PetscRealPart(dpi)); 423 for (m = 0; m < 2; m++) { 424 if (!m) w = PetscSqrtReal(dp * dpold); 425 else w = dp; 426 psi = w / tau; 427 cm = 1.0 / PetscSqrtReal(1.0 + psi * psi); 428 tau = tau * psi * cm; 429 eta = cm * cm * a; 430 cf = psiold * psiold * etaold / a; 431 if (!m) { 432 /* D = U + cf D */ 433 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = U[idx] + cf * D[idx]; }); 434 } else { 435 /* D = Q + cf D */ 436 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = Q[idx] + cf * D[idx]; }); 437 } 438 team.team_barrier(); 439 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = XX[idx] + eta * D[idx]; }); 440 team.team_barrier(); 441 dpest = PetscSqrtReal(2 * i + m + 2.0) * tau; 442 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 443 if (monitor && m == 1) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dpest); }); 444 #endif 445 if (dpest < atol) { 446 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 447 goto done; 448 } 449 if (dpest / r0 < rtol) { 450 metad->reason = KSP_CONVERGED_RTOL_NORMAL; 451 goto done; 452 } 453 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 454 if (dpest / r0 > dtol) { 455 metad->reason = KSP_DIVERGED_DTOL; 456 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dpest, r0); }); 457 goto done; 458 } 459 #else 460 if (dpest / r0 > dtol) { 461 metad->reason = KSP_DIVERGED_DTOL; 462 goto done; 463 } 464 #endif 465 if (i + 1 == maxit) { 466 metad->reason = KSP_DIVERGED_ITS; 467 goto done; 468 } 469 470 etaold = eta; 471 psiold = psi; 472 } 473 474 /* rho <- (r,rp) */ 475 parallel_reduce( 476 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rho); 477 team.team_barrier(); 478 b = rho / rhoold; /* b <- rho / rhoold */ 479 /* u <- r + b q */ 480 /* p <- u + b(q + b p) */ 481 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 482 U[idx] = R[idx] + b * Q[idx]; 483 Q[idx] = Q[idx] + b * P[idx]; 484 P[idx] = U[idx] + b * Q[idx]; 485 }); 486 /* v <- K p */ 487 team.team_barrier(); 488 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * P[idx]; }); 489 team.team_barrier(); 490 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V); 491 492 rhoold = rho; 493 dpold = dp; 494 495 i++; 496 } while (i < maxit); 497 done: 498 // KSPUnwindPreconditioner 499 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = Diag[idx] * XX[idx]; }); 500 team.team_barrier(); 501 // put x into Plex order 502 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 503 int rowa = ic[rowb]; 504 glb_x[rowa] = XX[rowb - start]; 505 }); 506 metad->its = i + 1; 507 if (1) { 508 int nnz; 509 parallel_reduce( 510 Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz); 511 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk); 512 } else { 513 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess 514 } 515 return 0; 516 } 517 518 // Solve Ax = y with biCG 519 KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_BICG(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor) { 520 using Kokkos::parallel_for; 521 using Kokkos::parallel_reduce; 522 int Nblk = end - start, i, stride = stride_shared, idx = 0; // start in shared mem 523 PetscReal dp, r0; 524 const PetscScalar *Di = &glb_idiag[start]; 525 PetscScalar *ptr = work_space_shared, dpi, a = 1.0, beta, betaold = 1.0, b, b2, ma, mac; 526 527 if (idx++ == nShareVec) { 528 ptr = work_space_global; 529 stride = stride_global; 530 } 531 PetscScalar *XX = ptr; 532 ptr += stride; 533 if (idx++ == nShareVec) { 534 ptr = work_space_global; 535 stride = stride_global; 536 } 537 PetscScalar *Rl = ptr; 538 ptr += stride; 539 if (idx++ == nShareVec) { 540 ptr = work_space_global; 541 stride = stride_global; 542 } 543 PetscScalar *Zl = ptr; 544 ptr += stride; 545 if (idx++ == nShareVec) { 546 ptr = work_space_global; 547 stride = stride_global; 548 } 549 PetscScalar *Pl = ptr; 550 ptr += stride; 551 if (idx++ == nShareVec) { 552 ptr = work_space_global; 553 stride = stride_global; 554 } 555 PetscScalar *Rr = ptr; 556 ptr += stride; 557 if (idx++ == nShareVec) { 558 ptr = work_space_global; 559 stride = stride_global; 560 } 561 PetscScalar *Zr = ptr; 562 ptr += stride; 563 if (idx++ == nShareVec) { 564 ptr = work_space_global; 565 stride = stride_global; 566 } 567 PetscScalar *Pr = ptr; 568 ptr += stride; 569 570 /* r <- b (x is 0) */ 571 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 572 int rowa = ic[rowb]; 573 //PetscCall(VecCopy(Rr,Rl)); 574 Rl[rowb - start] = Rr[rowb - start] = glb_b[rowa]; 575 XX[rowb - start] = 0; 576 }); 577 team.team_barrier(); 578 /* z <- Br */ 579 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 580 Zr[idx] = Di[idx] * Rr[idx]; 581 Zl[idx] = Di[idx] * Rl[idx]; 582 }); 583 team.team_barrier(); 584 /* dp <- r'*r */ 585 parallel_reduce( 586 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi); 587 team.team_barrier(); 588 r0 = dp = PetscSqrtReal(PetscRealPart(dpi)); 589 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 590 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); }); 591 #endif 592 if (dp < atol) { 593 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 594 return 0; 595 } 596 if (0 == maxit) { 597 metad->reason = KSP_DIVERGED_ITS; 598 return 0; 599 } 600 i = 0; 601 do { 602 /* beta <- r'z */ 603 parallel_reduce( 604 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += Zr[idx] * PetscConj(Rl[idx]); }, beta); 605 team.team_barrier(); 606 #if PCBJKOKKOS_VERBOSE_LEVEL >= 6 607 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 608 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%7d beta = Z.R = %22.14e \n", i, (double)beta); }); 609 #endif 610 #endif 611 if (!i) { 612 if (beta == 0.0) { 613 metad->reason = KSP_DIVERGED_BREAKDOWN_BICG; 614 goto done; 615 } 616 /* p <- z */ 617 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 618 Pr[idx] = Zr[idx]; 619 Pl[idx] = Zl[idx]; 620 }); 621 } else { 622 b = beta / betaold; 623 /* p <- z + b* p */ 624 b2 = PetscConj(b); 625 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 626 Pr[idx] = b * Pr[idx] + Zr[idx]; 627 Pl[idx] = b2 * Pl[idx] + Zl[idx]; 628 }); 629 } 630 team.team_barrier(); 631 betaold = beta; 632 /* z <- Kp */ 633 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pr, Zr); 634 MatMultTranspose(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pl, Zl); 635 /* dpi <- z'p */ 636 parallel_reduce( 637 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Zr[idx] * PetscConj(Pl[idx]); }, dpi); 638 team.team_barrier(); 639 // 640 a = beta / dpi; /* a = beta/p'z */ 641 ma = -a; 642 mac = PetscConj(ma); 643 /* x <- x + ap */ 644 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 645 XX[idx] = XX[idx] + a * Pr[idx]; 646 Rr[idx] = Rr[idx] + ma * Zr[idx]; 647 Rl[idx] = Rl[idx] + mac * Zl[idx]; 648 }); 649 team.team_barrier(); 650 team.team_barrier(); 651 /* dp <- r'*r */ 652 parallel_reduce( 653 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi); 654 team.team_barrier(); 655 dp = PetscSqrtReal(PetscRealPart(dpi)); 656 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 657 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dp); }); 658 #endif 659 if (dp < atol) { 660 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 661 goto done; 662 } 663 if (dp / r0 < rtol) { 664 metad->reason = KSP_CONVERGED_RTOL_NORMAL; 665 goto done; 666 } 667 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 668 if (dp / r0 > dtol) { 669 metad->reason = KSP_DIVERGED_DTOL; 670 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dp, r0); }); 671 goto done; 672 } 673 #else 674 if (dp / r0 > dtol) { 675 metad->reason = KSP_DIVERGED_DTOL; 676 goto done; 677 } 678 #endif 679 if (i + 1 == maxit) { 680 metad->reason = KSP_DIVERGED_ITS; 681 goto done; 682 } 683 /* z <- Br */ 684 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 685 Zr[idx] = Di[idx] * Rr[idx]; 686 Zl[idx] = Di[idx] * Rl[idx]; 687 }); 688 i++; 689 team.team_barrier(); 690 } while (i < maxit); 691 done: 692 // put x back into Plex order 693 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 694 int rowa = ic[rowb]; 695 glb_x[rowa] = XX[rowb - start]; 696 }); 697 metad->its = i + 1; 698 if (1) { 699 int nnz; 700 parallel_reduce( 701 Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz); 702 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk); 703 } else { 704 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess 705 } 706 return 0; 707 } 708 709 // KSP solver solve Ax = b; x is output, bin is input 710 static PetscErrorCode PCApply_BJKOKKOS(PC pc, Vec bin, Vec xout) { 711 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 712 Mat A = pc->pmat; 713 Mat_SeqAIJKokkos *aijkok; 714 715 PetscFunctionBegin; 716 PetscCheck(jac->vec_diag && A, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "Not setup???? %p %p", jac->vec_diag, A); 717 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 718 if (!aijkok) { 719 SETERRQ(PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "No aijkok"); 720 } else { 721 PetscInt maxit = jac->ksp->max_it; 722 const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1; 723 const PetscInt nwork = jac->nwork, nBlk = jac->nBlocks; 724 PetscScalar *glb_xdata = NULL; 725 PetscReal rtol = jac->ksp->rtol, atol = jac->ksp->abstol, dtol = jac->ksp->divtol; 726 const PetscScalar *glb_idiag = jac->d_idiag_k->data(), *glb_bdata = NULL; 727 const PetscInt *glb_Aai = aijkok->i_device_data(), *glb_Aaj = aijkok->j_device_data(), *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(); 728 const PetscScalar *glb_Aaa = aijkok->a_device_data(); 729 const PetscInt *d_isicol = jac->d_isicol_k->data(), *d_isrow = jac->d_isrow_k->data(); 730 PCFailedReason pcreason; 731 KSPIndex ksp_type_idx = jac->ksp_type_idx; 732 PetscMemType mtype; 733 PetscContainer container; 734 PetscInt batch_sz; 735 VecScatter plex_batch = NULL; // not used 736 Vec bvec; // a copy of b for scatter (just alias to bin now) 737 PetscBool monitor = jac->monitor; // captured 738 PetscInt view_bid = jac->batch_target; 739 MatInfo info; 740 jac->max_nits = 0; 741 if (view_bid < 0) view_bid = 0; 742 PetscCall(MatGetInfo(A, MAT_LOCAL, &info)); 743 // get field major is to map plex IO to/from block/field major 744 PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container)); 745 if (container) { 746 PetscCall(VecDuplicate(bin, &bvec)); 747 PetscCall(PetscContainerGetPointer(container, (void **)&plex_batch)); 748 PetscCall(VecScatterBegin(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD)); 749 PetscCall(VecScatterEnd(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD)); 750 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No plex_batch_is -- require NO field major ordering for now"); 751 } else { 752 bvec = bin; 753 } 754 // get x 755 PetscCall(VecGetArrayAndMemType(xout, &glb_xdata, &mtype)); 756 #if defined(PETSC_HAVE_CUDA) 757 PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for x %" PetscInt_FMT " != %" PetscInt_FMT, mtype, PETSC_MEMTYPE_DEVICE); 758 #endif 759 PetscCall(VecGetArrayReadAndMemType(bvec, &glb_bdata, &mtype)); 760 #if defined(PETSC_HAVE_CUDA) 761 PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for b"); 762 #endif 763 // get batch size 764 PetscCall(PetscObjectQuery((PetscObject)A, "batch size", (PetscObject *)&container)); 765 if (container) { 766 PetscInt *pNf = NULL; 767 PetscCall(PetscContainerGetPointer(container, (void **)&pNf)); 768 batch_sz = *pNf; 769 } else batch_sz = 1; 770 PetscCheck(nBlk % batch_sz == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "batch_sz = %" PetscInt_FMT ", nBlk = %" PetscInt_FMT, batch_sz, nBlk); 771 if (ksp_type_idx == BATCH_KSP_GMRES_IDX) { // KK solver - move PETSc data into Kokkos Views, setup solver, solve, move data out of Kokkos, process metadata (convergence tests, etc.) 772 #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES) 773 int Nsolves_team = jac->nsolves_team, fill_idx = 0; 774 int Nloc = jac->const_block_size; // same grids 775 const int Nsolves = nBlk; 776 const int nnz = (int)info.nz_used / Nsolves; // fix for variable grid size 777 if (Nsolves_team > batch_sz) Nsolves_team = batch_sz; // silently fix this 778 PetscCheck(jac->const_block_size, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Kokkos (GMRES) solver requires constant block size (but can be made to work with species ordering or N_team==1)"); 779 PetscCheck(Nsolves % Nsolves_team == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Nsolves.mod(Nsolves_team) != 0: Nsolves = %d, Nsolves_team = %d", Nsolves, Nsolves_team); 780 PetscCheck(((int)info.nz_used) % Nsolves == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "info.nz_used.mod(Nsolves) != 0: info.nz_used = %g, Nsolves = %d", info.nz_used, Nsolves); 781 #if defined(PETSC_HAVE_CUDA) 782 nvtxRangePushA("gmres-kk"); 783 #endif 784 Kokkos::View<PetscScalar **, layout, exec_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>> inv_diag((PetscScalar *)glb_idiag, Nsolves, Nloc); // in correct order 785 if (!jac->rowOffsets) { 786 jac->rowOffsets = new IntView("rowOffsets", Nsolves / Nsolves_team, Nloc + 1); // same grids 787 jac->colIndices = new IntView("colIndices", Nsolves / Nsolves_team, nnz); 788 jac->batch_b = new XYType("batch rhs", Nsolves, Nloc); 789 jac->batch_x = new XYType("batch sol", Nsolves, Nloc); 790 jac->batch_values = new AMatrixValueView("batch values", Nsolves, nnz); 791 fill_idx = 1; 792 PetscInfo(pc, "Setup indices Nloc=%d, nnz=%d\n", Nloc, nnz); 793 } 794 IntView &rowOffsets = *jac->rowOffsets; 795 IntView &colIndices = *jac->colIndices; 796 XYType &batch_b = *jac->batch_b; 797 XYType &batch_x = *jac->batch_x; 798 AMatrixValueView &batch_values = *jac->batch_values; 799 800 Kokkos::deep_copy(batch_x, 0.); 801 PetscInfo(pc, "\tjac->n = %" PetscInt_FMT ", Nloc = %d, Nsolves = %d, nnz = %d, Nsolves_team = %d, league size = %d, maxit = %" PetscInt_FMT "\n", jac->n, Nloc, Nsolves, nnz, Nsolves_team, Nsolves / Nsolves_team, maxit); 802 Kokkos::parallel_for( 803 "rowOffsets+map", Kokkos::TeamPolicy<>(Nsolves, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 804 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; 805 if (fill_idx) { 806 if (blkID % Nsolves_team == 0) { // first matrix on this member 807 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](const int rowb) { // Nloc 808 int rowa = d_isicol[rowb]; 809 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 810 rowOffsets(blkID / Nsolves_team, rowb + 1 - start) = n; // save sizes 811 }); 812 } 813 } 814 // map b into field major space 815 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 816 int rowa = d_isicol[rowb]; 817 batch_b(blkID, rowb - start) = glb_bdata[rowa]; 818 }); 819 }); 820 Kokkos::fence(); 821 if (fill_idx) { 822 Kokkos::parallel_for( 823 "prefix sum", Kokkos::TeamPolicy<>(Nsolves / Nsolves_team, 1, 1), KOKKOS_LAMBDA(const team_member team) { 824 const int graphID = team.league_rank(); 825 rowOffsets(graphID, 0) = 0; 826 for (size_t i = 0; i < Nloc; ++i) rowOffsets(graphID, i + 1) += rowOffsets(graphID, i); 827 }); 828 Kokkos::fence(); 829 } 830 Kokkos::parallel_for( 831 "copy matrix", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 832 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1], graphID = blkID / Nsolves_team; 833 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 834 int rowa = d_isicol[rowb]; // global index 835 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 836 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 837 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 838 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) { 839 PetscScalar val = aa[i]; 840 if (fill_idx && blkID % Nsolves_team == 0) colIndices(graphID, rowOffsets(graphID, rowb - start) + i) = d_isrow[aj[i]] - blkID * Nloc; // local" global - block start 841 batch_values(blkID, rowOffsets(graphID, rowb - start) + i) = val; 842 }); 843 }); 844 }); 845 Kokkos::fence(); 846 // setup solver 847 using ScalarType = typename AMatrixValueView::non_const_value_type; 848 using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type; 849 using NormViewType = Kokkos::View<MagnitudeType *, layout, exec_space>; 850 using Norm2DViewType = Kokkos::View<MagnitudeType **, layout, exec_space>; 851 using Scalar3DViewType = Kokkos::View<ScalarType ***, layout, exec_space>; 852 using IntViewType = Kokkos::View<int *, layout, exec_space>; 853 using KrylovHandleType = KokkosBatched::KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>; 854 const int n_iterations = maxit; 855 const int team_size = -1; 856 const int vector_length = -1; 857 const double tol = rtol; 858 const int ortho_strategy = 0; 859 KrylovHandleType handle(Nsolves, Nsolves_team, n_iterations, true); 860 handle.Arnoldi_view = Scalar3DViewType("", Nsolves, n_iterations, Nloc + n_iterations + 3); 861 // solve 862 double time = Functor_TestBatchedTeamVectorGMRES<exec_space, AMatrixValueView, IntView, XYType, KrylovHandleType>(batch_values, inv_diag, rowOffsets, colIndices, batch_x, batch_b, Nsolves_team, team_size, vector_length, n_iterations, tol, ortho_strategy, 0, handle) 863 .run(pc); 864 Kokkos::fence(); 865 // get data back 866 Kokkos::parallel_for( 867 "map", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 868 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; // 0 869 // map x into Plex/PETSc 870 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 871 int rowa = d_isicol[rowb]; 872 glb_xdata[rowa] = batch_x(blkID, rowb - start); 873 }); 874 }); 875 // output assume species major - clone from Kokkos solvers 876 #if PCBJKOKKOS_VERBOSE_LEVEL >= 3 877 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 878 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "Iterations\n")); 879 #else 880 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "max iterations per species (gmres) :")); 881 #endif 882 for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) { 883 for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) { 884 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 885 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2D:", s)); 886 for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3D ", handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx]))); 887 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 888 #else 889 int count = 0, ii; 890 for (int bid = 0; bid < batch_sz; bid++) { 891 if ((ii = handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx])) > count) count = ii; 892 } 893 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3d", count)); 894 #endif 895 } 896 head += batch_sz * jac->dm_Nf[dmIdx]; 897 } 898 #if PCBJKOKKOS_VERBOSE_LEVEL == 3 899 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 900 #endif 901 #endif 902 // return error code, get max it 903 PetscInt count = 0, mbid = 0; 904 if (handle.is_converged_host()) { 905 pcreason = PC_NOERROR; 906 if (!jac->max_nits) { 907 for (int blkID = 0; blkID < nBlk; blkID++) { 908 if (handle.get_iteration_host(blkID) > jac->max_nits) { 909 jac->max_nits = handle.get_iteration_host(blkID); 910 mbid = blkID; 911 } 912 } 913 } 914 } else { 915 PetscCall(PetscPrintf(PETSC_COMM_SELF, "There is at least one system that did not converge.")); 916 pcreason = PC_SUBPC_ERROR; 917 } 918 // output - assume species major order 919 for (int blkID = 0; blkID < nBlk; blkID++) { 920 if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason 921 if (jac->batch_target == blkID) { 922 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", handle.is_converged_host(blkID) ? "converged" : "diverged", handle.get_iteration_host(blkID), blkID % batch_sz, blkID / batch_sz)); 923 } else if (jac->batch_target == -1 && handle.get_iteration_host(blkID) > count) { 924 jac->max_nits = count = handle.get_iteration_host(blkID); 925 mbid = blkID; 926 } 927 if (!handle.is_converged_host(blkID)) PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR species %d, batch %d did not converge with %d iterations\n", (int)(blkID / batch_sz), (int)blkID % batch_sz, handle.get_iteration_host(blkID))); 928 } 929 } 930 if (jac->batch_target == -1 && jac->reason) { 931 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iteration, batch %" PetscInt_FMT ", specie %" PetscInt_FMT "\n", handle.is_converged_host(mbid) ? "converged" : "diverged", jac->max_nits, mbid % batch_sz, mbid / batch_sz)); 932 } 933 #if defined(PETSC_HAVE_CUDA) 934 nvtxRangePop(); 935 #endif 936 #else 937 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "batch GMRES not supported"); 938 #endif 939 } else { // Kokkos Krylov 940 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 941 using vect2D_scr_t = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, scr_mem_t>; 942 Kokkos::View<Batch_MetaData *, Kokkos::DefaultExecutionSpace> d_metadata("solver meta data", nBlk); 943 int stride_shared, stride_global, global_buff_words; 944 d_bid_eqOffset = jac->d_bid_eqOffset_k->data(); 945 // solve each block independently 946 int scr_bytes_team_shared = 0, nShareVec = 0, nGlobBVec = 0; 947 if (jac->const_block_size) { // use shared memory for work vectors only if constant block size - todo: test efficiency loss 948 int maximum_shared_mem_size = 64000; 949 PetscDevice device; 950 PetscCall(PetscDeviceGetDefault_Internal(&device)); 951 PetscCall(PetscDeviceGetAttribute(device, PETSC_DEVICE_ATTR_SIZE_T_SHARED_MEM_PER_BLOCK, &maximum_shared_mem_size)); 952 stride_shared = jac->const_block_size; // captured 953 nShareVec = maximum_shared_mem_size / (jac->const_block_size * sizeof(PetscScalar)); // integer floor, number of vectors that fit in shared 954 if (nShareVec > nwork) nShareVec = nwork; 955 else nGlobBVec = nwork - nShareVec; 956 global_buff_words = jac->n * nGlobBVec; 957 scr_bytes_team_shared = jac->const_block_size * nShareVec * sizeof(PetscScalar); 958 //PetscCall(PetscPrintf(PETSC_COMM_WORLD,"maximum_shared_mem_size=%d scr_bytes_shared=%d nShareVec=%d, nGlobBVec=%d vec size=%d jac->const_block_size=%d\n",maximum_shared_mem_size,scr_bytes_team_shared,nShareVec,nGlobBVec,jac->const_block_size*sizeof(PetscScalar),jac->const_block_size)); 959 } else { 960 scr_bytes_team_shared = 0; 961 stride_shared = 0; 962 global_buff_words = jac->n * nwork; 963 nGlobBVec = nwork; // not needed == fix 964 } 965 stride_global = jac->n; // captured 966 #if defined(PETSC_HAVE_CUDA) 967 nvtxRangePushA("batch-kokkos-solve"); 968 #endif 969 Kokkos::View<PetscScalar *, Kokkos::DefaultExecutionSpace> d_work_vecs_k("workvectors", global_buff_words); // global work vectors 970 PetscInfo(pc, "\tn = %d. %d shared bytes/team, %d global mem bytes, rtol=%e, num blocks %d, team_size=%d, %d vector threads, %d shared vectors, %d global vectors\n", (int)jac->n, scr_bytes_team_shared, global_buff_words, rtol, (int)nBlk, (int)team_size, PCBJKOKKOS_VEC_SIZE, nShareVec, nGlobBVec); 971 PetscScalar *d_work_vecs = d_work_vecs_k.data(); 972 Kokkos::parallel_for( 973 "Solve", Kokkos::TeamPolicy<Kokkos::LaunchBounds<256, 4>>(nBlk, team_size, PCBJKOKKOS_VEC_SIZE).set_scratch_size(PCBJKOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes_team_shared)), KOKKOS_LAMBDA(const team_member team) { 974 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; 975 vect2D_scr_t work_vecs_shared(team.team_scratch(PCBJKOKKOS_SHARED_LEVEL), end - start, nShareVec); 976 PetscScalar *work_buff_shared = work_vecs_shared.data(); 977 PetscScalar *work_buff_global = &d_work_vecs[start]; // start inc'ed in 978 bool print = monitor && (blkID == view_bid); 979 switch (ksp_type_idx) { 980 case BATCH_KSP_BICG_IDX: 981 BJSolve_BICG(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print); 982 break; 983 case BATCH_KSP_TFQMR_IDX: 984 BJSolve_TFQMR(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print); 985 break; 986 case BATCH_KSP_GMRES_IDX: 987 //BJSolve_GMRES(); 988 break; 989 default: 990 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 991 printf("Unknown KSP type %d\n", ksp_type_idx); 992 #else 993 /* void */; 994 #endif 995 } 996 }); 997 Kokkos::fence(); 998 #if defined(PETSC_HAVE_CUDA) 999 nvtxRangePop(); 1000 nvtxRangePushA("Post-solve-metadata"); 1001 #endif 1002 auto h_metadata = Kokkos::create_mirror(Kokkos::HostSpace::memory_space(), d_metadata); 1003 Kokkos::deep_copy(h_metadata, d_metadata); 1004 #if PCBJKOKKOS_VERBOSE_LEVEL >= 3 1005 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 1006 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Iterations\n")); 1007 #endif 1008 // assume species major 1009 #if PCBJKOKKOS_VERBOSE_LEVEL < 4 1010 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "max iterations per species (%s) :", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr")); 1011 #endif 1012 for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) { 1013 for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) { 1014 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 1015 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2" PetscInt_FMT ":", s)); 1016 for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its)); 1017 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 1018 #else 1019 PetscInt count = 0; 1020 for (int bid = 0; bid < batch_sz; bid++) { 1021 if (h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its > count) count = h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its; 1022 } 1023 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", count)); 1024 #endif 1025 } 1026 head += batch_sz * jac->dm_Nf[dmIdx]; 1027 } 1028 #if PCBJKOKKOS_VERBOSE_LEVEL == 3 1029 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 1030 #endif 1031 #endif 1032 PetscInt count = 0, mbid = 0; 1033 for (int blkID = 0; blkID < nBlk; blkID++) { 1034 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops)); 1035 if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason 1036 if (jac->batch_target == blkID) { 1037 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID % batch_sz, blkID / batch_sz)); 1038 } else if (jac->batch_target == -1 && h_metadata[blkID].its > count) { 1039 jac->max_nits = count = h_metadata[blkID].its; 1040 mbid = blkID; 1041 } 1042 if (h_metadata[blkID].reason < 0) { 1043 PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz)); 1044 } 1045 } 1046 } 1047 if (jac->batch_target == -1 && jac->reason) { 1048 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", specie %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[mbid].reason], h_metadata[mbid].its, mbid % batch_sz, mbid / batch_sz)); 1049 } 1050 { 1051 int errsum; 1052 Kokkos::parallel_reduce( 1053 nBlk, 1054 KOKKOS_LAMBDA(const int idx, int &lsum) { 1055 if (d_metadata[idx].reason < 0) ++lsum; 1056 }, 1057 errsum); 1058 pcreason = errsum ? PC_SUBPC_ERROR : PC_NOERROR; 1059 if (!errsum && !jac->max_nits) { // set max its to give back to top KSP 1060 for (int blkID = 0; blkID < nBlk; blkID++) { 1061 if (h_metadata[blkID].its > jac->max_nits) jac->max_nits = h_metadata[blkID].its; 1062 } 1063 } else if (errsum) { 1064 PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR Kokkos batch solver did not converge in all solves\n")); 1065 } 1066 } 1067 #if defined(PETSC_HAVE_CUDA) 1068 nvtxRangePop(); 1069 #endif 1070 } // end of Kokkos (not Kernels) solvers block 1071 PetscCall(VecRestoreArrayAndMemType(xout, &glb_xdata)); 1072 PetscCall(VecRestoreArrayReadAndMemType(bvec, &glb_bdata)); 1073 PetscCall(PCSetFailedReason(pc, pcreason)); 1074 // map back to Plex space - not used 1075 if (plex_batch) { 1076 PetscCall(VecCopy(xout, bvec)); 1077 PetscCall(VecScatterBegin(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE)); 1078 PetscCall(VecScatterEnd(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE)); 1079 PetscCall(VecDestroy(&bvec)); 1080 } 1081 } // whole 'have aijkok' block 1082 PetscFunctionReturn(0); 1083 } 1084 1085 static PetscErrorCode PCSetUp_BJKOKKOS(PC pc) { 1086 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1087 Mat A = pc->pmat; 1088 Mat_SeqAIJKokkos *aijkok; 1089 PetscBool flg; 1090 1091 PetscFunctionBegin; 1092 PetscCheck(!pc->useAmat, PetscObjectComm((PetscObject)pc), PETSC_ERR_SUP, "No support for using 'use_amat'"); 1093 PetscCheck(A, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No matrix - A is used above"); 1094 PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, MATAIJKOKKOS, "")); 1095 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "must use '-dm_mat_type aijkokkos -dm_vec_type kokkos' for -pc_type bjkokkos"); 1096 if (!(aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr))) { 1097 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No aijkok"); 1098 } else { 1099 if (!jac->vec_diag) { 1100 Vec *subX; 1101 DM pack, *subDM; 1102 PetscInt nDMs, n; 1103 PetscContainer container; 1104 PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container)); 1105 { // Permute the matrix to get a block diagonal system: d_isrow_k, d_isicol_k 1106 MatOrderingType rtype; 1107 IS isrow, isicol; 1108 const PetscInt *rowindices, *icolindices; 1109 rtype = MATORDERINGRCM; 1110 // get permutation. Not what I expect so inverted here 1111 PetscCall(MatGetOrdering(A, rtype, &isrow, &isicol)); 1112 PetscCall(ISDestroy(&isrow)); 1113 PetscCall(ISInvertPermutation(isicol, PETSC_DECIDE, &isrow)); // THIS IS BACKWARD -- isrow is inverse -- FIX!!!!! 1114 1115 Mat mat_block_order; 1116 PetscCall(MatCreateSubMatrix(A, isicol, isicol, MAT_INITIAL_MATRIX, &mat_block_order)); 1117 PetscCall(MatViewFromOptions(mat_block_order, NULL, "-ksp_batch_reorder_view")); 1118 PetscCall(MatDestroy(&mat_block_order)); 1119 1120 PetscCall(ISGetIndices(isrow, &rowindices)); 1121 PetscCall(ISGetIndices(isicol, &icolindices)); 1122 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isrow_k((PetscInt *)rowindices, A->rmap->n); 1123 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isicol_k((PetscInt *)icolindices, A->rmap->n); 1124 jac->d_isrow_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isrow_k)); 1125 jac->d_isicol_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isicol_k)); 1126 Kokkos::deep_copy(*jac->d_isrow_k, h_isrow_k); 1127 Kokkos::deep_copy(*jac->d_isicol_k, h_isicol_k); 1128 PetscCall(ISRestoreIndices(isrow, &rowindices)); 1129 PetscCall(ISRestoreIndices(isicol, &icolindices)); 1130 PetscCall(ISDestroy(&isrow)); 1131 PetscCall(ISDestroy(&isicol)); 1132 } 1133 // get block sizes 1134 PetscCall(PCGetDM(pc, &pack)); 1135 PetscCheck(pack, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "no DM. Requires a composite DM"); 1136 PetscCall(PetscObjectTypeCompare((PetscObject)pack, DMCOMPOSITE, &flg)); 1137 PetscCheck(flg, PetscObjectComm((PetscObject)pack), PETSC_ERR_USER, "Not for type %s", ((PetscObject)pack)->type_name); 1138 PetscCall(DMCompositeGetNumberDM(pack, &nDMs)); 1139 jac->num_dms = nDMs; 1140 PetscCall(DMCreateGlobalVector(pack, &jac->vec_diag)); 1141 PetscCall(VecGetLocalSize(jac->vec_diag, &n)); 1142 jac->n = n; 1143 jac->d_idiag_k = new Kokkos::View<PetscScalar *, Kokkos::LayoutRight>("idiag", n); 1144 // options 1145 PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1146 PetscCall(KSPSetFromOptions(jac->ksp)); 1147 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPBICG, "")); 1148 if (flg) { 1149 jac->ksp_type_idx = BATCH_KSP_BICG_IDX; 1150 jac->nwork = 7; 1151 } else { 1152 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPTFQMR, "")); 1153 if (flg) { 1154 jac->ksp_type_idx = BATCH_KSP_TFQMR_IDX; 1155 jac->nwork = 10; 1156 } else { 1157 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPGMRES, "")); 1158 if (flg) { 1159 jac->ksp_type_idx = BATCH_KSP_GMRES_IDX; 1160 jac->nwork = 0; 1161 } else { 1162 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unsupported batch ksp type"); 1163 } 1164 } 1165 } 1166 PetscOptionsBegin(PetscObjectComm((PetscObject)jac->ksp), ((PetscObject)jac->ksp)->prefix, "Options for Kokkos batch solver", "none"); 1167 PetscCall(PetscOptionsBool("-ksp_converged_reason", "", "bjkokkos.kokkos.cxx.c", jac->reason, &jac->reason, NULL)); 1168 PetscCall(PetscOptionsBool("-ksp_monitor", "", "bjkokkos.kokkos.cxx.c", jac->monitor, &jac->monitor, NULL)); 1169 PetscCall(PetscOptionsInt("-ksp_batch_target", "", "bjkokkos.kokkos.cxx.c", jac->batch_target, &jac->batch_target, NULL)); 1170 PetscCall(PetscOptionsInt("-ksp_batch_nsolves_team", "", "bjkokkos.kokkos.cxx.c", jac->nsolves_team, &jac->nsolves_team, NULL)); 1171 PetscCheck(jac->batch_target < jac->num_dms, PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "-ksp_batch_target (%" PetscInt_FMT ") >= number of DMs (%" PetscInt_FMT ")", jac->batch_target, jac->num_dms); 1172 PetscOptionsEnd(); 1173 // get blocks - jac->d_bid_eqOffset_k 1174 PetscCall(PetscMalloc(sizeof(*subX) * nDMs, &subX)); 1175 PetscCall(PetscMalloc(sizeof(*subDM) * nDMs, &subDM)); 1176 PetscCall(PetscMalloc(sizeof(*jac->dm_Nf) * nDMs, &jac->dm_Nf)); 1177 PetscCall(PetscInfo(pc, "Have %" PetscInt_FMT " DMs, n=%" PetscInt_FMT " rtol=%g type = %s\n", nDMs, n, (double)jac->ksp->rtol, ((PetscObject)jac->ksp)->type_name)); 1178 PetscCall(DMCompositeGetEntriesArray(pack, subDM)); 1179 jac->nBlocks = 0; 1180 for (PetscInt ii = 0; ii < nDMs; ii++) { 1181 PetscSection section; 1182 PetscInt Nf; 1183 DM dm = subDM[ii]; 1184 PetscCall(DMGetLocalSection(dm, §ion)); 1185 PetscCall(PetscSectionGetNumFields(section, &Nf)); 1186 jac->nBlocks += Nf; 1187 #if PCBJKOKKOS_VERBOSE_LEVEL <= 2 1188 if (ii == 0) PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks)); 1189 #else 1190 PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks)); 1191 #endif 1192 jac->dm_Nf[ii] = Nf; 1193 } 1194 { // d_bid_eqOffset_k 1195 Kokkos::View<PetscInt *, Kokkos::LayoutRight, Kokkos::HostSpace> h_block_offsets("block_offsets", jac->nBlocks + 1); 1196 PetscCall(DMCompositeGetAccessArray(pack, jac->vec_diag, nDMs, NULL, subX)); 1197 h_block_offsets[0] = 0; 1198 jac->const_block_size = -1; 1199 for (PetscInt ii = 0, idx = 0; ii < nDMs; ii++) { 1200 PetscInt nloc, nblk; 1201 PetscCall(VecGetSize(subX[ii], &nloc)); 1202 nblk = nloc / jac->dm_Nf[ii]; 1203 PetscCheck(nloc % jac->dm_Nf[ii] == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "nloc%%jac->dm_Nf[ii] (%" PetscInt_FMT ") != 0 DMs", nloc % jac->dm_Nf[ii]); 1204 for (PetscInt jj = 0; jj < jac->dm_Nf[ii]; jj++, idx++) { 1205 h_block_offsets[idx + 1] = h_block_offsets[idx] + nblk; 1206 #if PCBJKOKKOS_VERBOSE_LEVEL <= 2 1207 if (idx == 0) PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks)); 1208 #else 1209 PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks)); 1210 #endif 1211 if (jac->const_block_size == -1) jac->const_block_size = nblk; 1212 else if (jac->const_block_size > 0 && jac->const_block_size != nblk) jac->const_block_size = 0; 1213 } 1214 } 1215 PetscCall(DMCompositeRestoreAccessArray(pack, jac->vec_diag, jac->nBlocks, NULL, subX)); 1216 PetscCall(PetscFree(subX)); 1217 PetscCall(PetscFree(subDM)); 1218 jac->d_bid_eqOffset_k = new Kokkos::View<PetscInt *, Kokkos::LayoutRight>(Kokkos::create_mirror(Kokkos::DefaultExecutionSpace::memory_space(), h_block_offsets)); 1219 Kokkos::deep_copy(*jac->d_bid_eqOffset_k, h_block_offsets); 1220 } 1221 } 1222 { // get jac->d_idiag_k (PC setup), 1223 const PetscInt *d_ai = aijkok->i_device_data(), *d_aj = aijkok->j_device_data(); 1224 const PetscScalar *d_aa = aijkok->a_device_data(); 1225 const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1; 1226 PetscInt *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(), *r = jac->d_isrow_k->data(), *ic = jac->d_isicol_k->data(); 1227 PetscScalar *d_idiag = jac->d_idiag_k->data(); 1228 Kokkos::parallel_for( 1229 "Diag", Kokkos::TeamPolicy<>(jac->nBlocks, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 1230 const PetscInt blkID = team.league_rank(); 1231 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, d_bid_eqOffset[blkID], d_bid_eqOffset[blkID + 1]), [=](const int rowb) { 1232 const PetscInt rowa = ic[rowb], ai = d_ai[rowa], *aj = d_aj + ai; // grab original data 1233 const PetscScalar *aa = d_aa + ai; 1234 const PetscInt nrow = d_ai[rowa + 1] - ai; 1235 int found; 1236 Kokkos::parallel_reduce( 1237 Kokkos::ThreadVectorRange(team, nrow), 1238 [=](const int &j, int &count) { 1239 const PetscInt colb = r[aj[j]]; 1240 if (colb == rowb) { 1241 d_idiag[rowb] = 1. / aa[j]; 1242 count++; 1243 } 1244 }, 1245 found); 1246 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1247 if (found != 1) Kokkos::single(Kokkos::PerThread(team), [=]() { printf("ERRORrow %d) found = %d\n", rowb, found); }); 1248 #endif 1249 }); 1250 }); 1251 } 1252 } 1253 PetscFunctionReturn(0); 1254 } 1255 1256 /* Default destroy, if it has never been setup */ 1257 static PetscErrorCode PCReset_BJKOKKOS(PC pc) { 1258 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1259 1260 PetscFunctionBegin; 1261 PetscCall(KSPDestroy(&jac->ksp)); 1262 PetscCall(VecDestroy(&jac->vec_diag)); 1263 if (jac->d_bid_eqOffset_k) delete jac->d_bid_eqOffset_k; 1264 if (jac->d_idiag_k) delete jac->d_idiag_k; 1265 if (jac->d_isrow_k) delete jac->d_isrow_k; 1266 if (jac->d_isicol_k) delete jac->d_isicol_k; 1267 jac->d_bid_eqOffset_k = NULL; 1268 jac->d_idiag_k = NULL; 1269 jac->d_isrow_k = NULL; 1270 jac->d_isicol_k = NULL; 1271 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", NULL)); // not published now (causes configure errors) 1272 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", NULL)); 1273 PetscCall(PetscFree(jac->dm_Nf)); 1274 jac->dm_Nf = NULL; 1275 if (jac->rowOffsets) delete jac->rowOffsets; 1276 if (jac->colIndices) delete jac->colIndices; 1277 if (jac->batch_b) delete jac->batch_b; 1278 if (jac->batch_x) delete jac->batch_x; 1279 if (jac->batch_values) delete jac->batch_values; 1280 jac->rowOffsets = NULL; 1281 jac->colIndices = NULL; 1282 jac->batch_b = NULL; 1283 jac->batch_x = NULL; 1284 jac->batch_values = NULL; 1285 1286 PetscFunctionReturn(0); 1287 } 1288 1289 static PetscErrorCode PCDestroy_BJKOKKOS(PC pc) { 1290 PetscFunctionBegin; 1291 PetscCall(PCReset_BJKOKKOS(pc)); 1292 PetscCall(PetscFree(pc->data)); 1293 PetscFunctionReturn(0); 1294 } 1295 1296 static PetscErrorCode PCView_BJKOKKOS(PC pc, PetscViewer viewer) { 1297 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1298 PetscBool iascii; 1299 1300 PetscFunctionBegin; 1301 if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1302 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1303 if (iascii) { 1304 PetscCall(PetscViewerASCIIPrintf(viewer, " Batched device linear solver: Krylov (KSP) method with Jacobi preconditioning\n")); 1305 PetscCall(PetscViewerASCIIPrintf(viewer, "\t\tnwork = %" PetscInt_FMT ", rel tol = %e, abs tol = %e, div tol = %e, max it =%" PetscInt_FMT ", type = %s\n", jac->nwork, jac->ksp->rtol, jac->ksp->abstol, jac->ksp->divtol, jac->ksp->max_it, 1306 ((PetscObject)jac->ksp)->type_name)); 1307 } 1308 PetscFunctionReturn(0); 1309 } 1310 1311 static PetscErrorCode PCSetFromOptions_BJKOKKOS(PC pc, PetscOptionItems *PetscOptionsObject) { 1312 PetscFunctionBegin; 1313 PetscOptionsHeadBegin(PetscOptionsObject, "PC BJKOKKOS options"); 1314 PetscOptionsHeadEnd(); 1315 PetscFunctionReturn(0); 1316 } 1317 1318 static PetscErrorCode PCBJKOKKOSSetKSP_BJKOKKOS(PC pc, KSP ksp) { 1319 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1320 1321 PetscFunctionBegin; 1322 PetscCall(PetscObjectReference((PetscObject)ksp)); 1323 PetscCall(KSPDestroy(&jac->ksp)); 1324 jac->ksp = ksp; 1325 PetscFunctionReturn(0); 1326 } 1327 1328 /*@C 1329 PCBJKOKKOSSetKSP - Sets the `KSP` context for `PCBJKOKKOS` 1330 1331 Collective on pc 1332 1333 Input Parameters: 1334 + pc - the `PCBJKOKKOS` preconditioner context 1335 - ksp - the `KSP` solver 1336 1337 Notes: 1338 The `PC` and the `KSP` must have the same communicator 1339 1340 If the `PC` is not `PCBJKOKKOS` this function returns without doing anything 1341 1342 Level: advanced 1343 1344 ,seealso: `PCBJKOKKOSGetKSP()`, `PCBJKOKKOS` 1345 @*/ 1346 PetscErrorCode PCBJKOKKOSSetKSP(PC pc, KSP ksp) { 1347 PetscFunctionBegin; 1348 PetscValidHeaderSpecific(pc, PC_CLASSID, 1); 1349 PetscValidHeaderSpecific(ksp, KSP_CLASSID, 2); 1350 PetscCheckSameComm(pc, 1, ksp, 2); 1351 PetscTryMethod(pc, "PCBJKOKKOSSetKSP_C", (PC, KSP), (pc, ksp)); 1352 PetscFunctionReturn(0); 1353 } 1354 1355 static PetscErrorCode PCBJKOKKOSGetKSP_BJKOKKOS(PC pc, KSP *ksp) { 1356 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1357 1358 PetscFunctionBegin; 1359 if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1360 *ksp = jac->ksp; 1361 PetscFunctionReturn(0); 1362 } 1363 1364 /*@C 1365 PCBJKOKKOSGetKSP - Gets the `KSP` context for the `PCBJKOKKOS` preconditioner 1366 1367 Not Collective but `KSP` returned is parallel if `PC` was parallel 1368 1369 Input Parameter: 1370 . pc - the preconditioner context 1371 1372 Output Parameter: 1373 . ksp - the `KSP` solver 1374 1375 Notes: 1376 You must call `KSPSetUp()` before calling `PCBJKOKKOSGetKSP()`. 1377 1378 If the `PC` is not a `PCBJKOKKOS` object it raises an error 1379 1380 Level: advanced 1381 1382 .seealso: `PCBJKOKKOS`, `PCBJKOKKOSSetKSP()` 1383 @*/ 1384 PetscErrorCode PCBJKOKKOSGetKSP(PC pc, KSP *ksp) { 1385 PetscFunctionBegin; 1386 PetscValidHeaderSpecific(pc, PC_CLASSID, 1); 1387 PetscValidPointer(ksp, 2); 1388 PetscUseMethod(pc, "PCBJKOKKOSGetKSP_C", (PC, KSP *), (pc, ksp)); 1389 PetscFunctionReturn(0); 1390 } 1391 1392 static PetscErrorCode PCPostSolve_BJKOKKOS(PC pc, KSP ksp, Vec b, Vec x) { 1393 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1394 1395 PetscFunctionBegin; 1396 ksp->its = jac->max_nits; 1397 PetscFunctionReturn(0); 1398 } 1399 1400 /*MC 1401 PCBJKOKKOS - Defines a preconditioner that applies a Krylov solver and preconditioner to the blocks in a `MATSEQAIJ` matrix on the GPU using Kokkos 1402 1403 Options Database Key: 1404 . -pc_bjkokkos_ - options prefix for its `KSP` options 1405 1406 Level: intermediate 1407 1408 Note: 1409 For use with -ksp_type preonly to bypass any computation on the CPU 1410 1411 Developer Notes: 1412 The documentation is incomplete. Is this a block Jacobi preconditioner? 1413 1414 Why does it have its own `KSP`? Where is the `KSP` run if used with -ksp_type preonly? 1415 1416 .seealso: `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `PCBJACOBI`, 1417 `PCSHELL`, `PCCOMPOSITE`, `PCSetUseAmat()`, `PCBJKOKKOSGetKSP()` 1418 M*/ 1419 1420 PETSC_EXTERN PetscErrorCode PCCreate_BJKOKKOS(PC pc) { 1421 PC_PCBJKOKKOS *jac; 1422 1423 PetscFunctionBegin; 1424 PetscCall(PetscNew(&jac)); 1425 pc->data = (void *)jac; 1426 1427 jac->ksp = NULL; 1428 jac->vec_diag = NULL; 1429 jac->d_bid_eqOffset_k = NULL; 1430 jac->d_idiag_k = NULL; 1431 jac->d_isrow_k = NULL; 1432 jac->d_isicol_k = NULL; 1433 jac->nBlocks = 1; 1434 jac->max_nits = 0; 1435 1436 PetscCall(PetscMemzero(pc->ops, sizeof(struct _PCOps))); 1437 pc->ops->apply = PCApply_BJKOKKOS; 1438 pc->ops->applytranspose = NULL; 1439 pc->ops->setup = PCSetUp_BJKOKKOS; 1440 pc->ops->reset = PCReset_BJKOKKOS; 1441 pc->ops->destroy = PCDestroy_BJKOKKOS; 1442 pc->ops->setfromoptions = PCSetFromOptions_BJKOKKOS; 1443 pc->ops->view = PCView_BJKOKKOS; 1444 pc->ops->postsolve = PCPostSolve_BJKOKKOS; 1445 1446 jac->rowOffsets = NULL; 1447 jac->colIndices = NULL; 1448 jac->batch_b = NULL; 1449 jac->batch_x = NULL; 1450 jac->batch_values = NULL; 1451 1452 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", PCBJKOKKOSGetKSP_BJKOKKOS)); 1453 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", PCBJKOKKOSSetKSP_BJKOKKOS)); 1454 PetscFunctionReturn(0); 1455 } 1456