1 #include <petscvec_kokkos.hpp> 2 #include <petsc/private/deviceimpl.h> 3 #include <petsc/private/pcimpl.h> 4 #include <petsc/private/kspimpl.h> 5 #include <petscksp.h> /*I "petscksp.h" I*/ 6 #include "petscsection.h" 7 #include <petscdmcomposite.h> 8 #include "Kokkos_Core.hpp" 9 10 #include <../src/mat/impls/aij/seq/aij.h> 11 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 12 13 #if defined(PETSC_HAVE_CUDA) 14 #include <nvToolsExt.h> 15 #endif 16 17 #define PCBJKOKKOS_SHARED_LEVEL 0 // 0 is shared, 1 is global 18 #define PCBJKOKKOS_VEC_SIZE 16 19 #define PCBJKOKKOS_TEAM_SIZE 16 20 21 #define PCBJKOKKOS_VERBOSE_LEVEL 2 22 23 typedef Kokkos::DefaultExecutionSpace exec_space; 24 using layout = Kokkos::LayoutRight; 25 using IntView = Kokkos::View<PetscInt **, layout, exec_space>; 26 using AMatrixValueView = const Kokkos::View<PetscScalar **, layout, exec_space>; 27 using XYType = const Kokkos::View<PetscScalar **, layout, exec_space>; 28 29 typedef enum { 30 BATCH_KSP_BICG_IDX, 31 BATCH_KSP_TFQMR_IDX, 32 BATCH_KSP_GMRES_IDX, 33 NUM_BATCH_TYPES 34 } KSPIndex; 35 typedef struct { 36 Vec vec_diag; 37 PetscInt nBlocks; /* total number of blocks */ 38 PetscInt n; // cache host version of d_bid_eqOffset_k[nBlocks] 39 KSP ksp; // Used just for options. Should have one for each block 40 Kokkos::View<PetscInt *, Kokkos::LayoutRight> *d_bid_eqOffset_k; 41 Kokkos::View<PetscScalar *, Kokkos::LayoutRight> *d_idiag_k; 42 Kokkos::View<PetscInt *> *d_isrow_k; 43 Kokkos::View<PetscInt *> *d_isicol_k; 44 KSPIndex ksp_type_idx; 45 PetscInt nwork; 46 PetscInt const_block_size; // used to decide to use shared memory for work vectors 47 PetscInt *dm_Nf; // Number of fields in each DM 48 PetscInt num_dms; 49 // diagnostics 50 PetscBool reason; 51 PetscBool monitor; 52 PetscInt batch_target; 53 PetscInt nsolves_team; 54 PetscInt max_nits; 55 // caches 56 IntView *rowOffsets; 57 IntView *colIndices; 58 XYType *batch_b; 59 XYType *batch_x; 60 AMatrixValueView *batch_values; 61 } PC_PCBJKOKKOS; 62 63 #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES) 64 #include <fstream> 65 66 #include "Kokkos_Timer.hpp" 67 #include "Kokkos_Random.hpp" 68 #include "Kokkos_UnorderedMap.hpp" 69 #include "Kokkos_Sort.hpp" 70 71 /// KokkosKernels headers 72 #include "KokkosBatched_Util.hpp" 73 #include "KokkosBatched_Vector.hpp" 74 75 #include <Kokkos_ArithTraits.hpp> 76 #include <KokkosBatched_Util.hpp> 77 #include <KokkosBatched_Vector.hpp> 78 #include <KokkosBatched_Copy_Decl.hpp> 79 #include <KokkosBatched_Copy_Impl.hpp> 80 #include <KokkosBatched_AddRadial_Decl.hpp> 81 #include <KokkosBatched_AddRadial_Impl.hpp> 82 #include <KokkosBatched_Gemm_Decl.hpp> 83 #include <KokkosBatched_Gemm_Serial_Impl.hpp> 84 #include <KokkosBatched_Gemm_Team_Impl.hpp> 85 #include <KokkosBatched_Gemv_Decl.hpp> 86 #include <KokkosBatched_Gemv_Serial_Impl.hpp> 87 #include <KokkosBatched_Gemv_Team_Impl.hpp> 88 #include <KokkosBatched_Trsm_Decl.hpp> 89 #include <KokkosBatched_Trsm_Serial_Impl.hpp> 90 #include <KokkosBatched_Trsm_Team_Impl.hpp> 91 #include <KokkosBatched_Trsv_Decl.hpp> 92 #include <KokkosBatched_Trsv_Serial_Impl.hpp> 93 #include <KokkosBatched_Trsv_Team_Impl.hpp> 94 #include <KokkosBatched_LU_Decl.hpp> 95 #include <KokkosBatched_LU_Serial_Impl.hpp> 96 #include <KokkosBatched_LU_Team_Impl.hpp> 97 #include <KokkosSparse_CrsMatrix.hpp> 98 #include "KokkosBatched_Spmv.hpp" 99 #include "KokkosBatched_CrsMatrix.hpp" 100 #include "KokkosBatched_Krylov_Handle.hpp" 101 #include "KokkosBatched_GMRES.hpp" 102 #include "KokkosBatched_JacobiPrec.hpp" 103 104 template <typename DeviceType, typename ValuesViewType, typename IntView, typename VectorViewType, typename KrylovHandleType> 105 struct Functor_TestBatchedTeamVectorGMRES { 106 const ValuesViewType _D; 107 const ValuesViewType _diag; 108 const IntView _r; 109 const IntView _c; 110 const VectorViewType _X; 111 const VectorViewType _B; 112 const int _N_team, _team_size, _vector_length; 113 const int _N_iteration; 114 const double _tol; 115 const int _ortho_strategy; 116 const int _scratch_pad_level; 117 KrylovHandleType _handle; 118 119 KOKKOS_INLINE_FUNCTION 120 Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, const int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) : 121 _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle) { } 122 123 KOKKOS_INLINE_FUNCTION 124 Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) : 125 _D(D), _diag(diag), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle) { } 126 127 template <typename MemberType> 128 KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { 129 const int first_matrix = static_cast<int>(member.league_rank()) * _N_team; 130 const int N = _D.extent(0); 131 const int last_matrix = (static_cast<int>(member.league_rank() + 1) * _N_team < N ? static_cast<int>(member.league_rank() + 1) * _N_team : N); 132 const int graphID = static_cast<int>(member.league_rank()); 133 using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy<MemberType, KokkosBatched::Trans::NoTranspose, 1>; 134 135 auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 136 auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 137 auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); 138 using ScratchPadIntViewType = Kokkos::View<typename IntView::non_const_value_type *, typename IntView::array_layout, typename IntView::execution_space::scratch_memory_space>; 139 using ScratchPadValuesViewType = Kokkos::View<typename ValuesViewType::non_const_value_type **, typename ValuesViewType::array_layout, typename ValuesViewType::execution_space::scratch_memory_space>; 140 141 using Operator = KokkosBatched::CrsMatrix<ValuesViewType, ScratchPadIntViewType>; 142 ScratchPadIntViewType r(member.team_scratch(1), _r.extent(1)); 143 ScratchPadIntViewType c(member.team_scratch(1), _c.extent(1)); 144 145 TeamVectorCopy1D::invoke(member, Kokkos::subview(_r, graphID, Kokkos::ALL), r); 146 TeamVectorCopy1D::invoke(member, Kokkos::subview(_c, graphID, Kokkos::ALL), c); 147 Operator A(d, r, c); 148 149 ScratchPadValuesViewType diag(member.team_scratch(1), last_matrix - first_matrix, _diag.extent(1)); 150 using PrecOperator = KokkosBatched::JacobiPrec<ScratchPadValuesViewType>; 151 152 KokkosBatched::TeamVectorCopy<MemberType>::invoke(member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag); 153 PrecOperator P(diag); 154 P.setComputedInverse(); 155 156 KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<Operator, VectorViewType, PrecOperator, KrylovHandleType>(member, A, b, x, P, _handle); 157 } 158 inline double run(PC pc) { 159 typedef typename ValuesViewType::value_type value_type; 160 std::string name("KokkosBatched::Test::TeamVectorGMRES"); 161 Kokkos::Timer timer; 162 Kokkos::Profiling::pushRegion(name.c_str()); 163 164 Kokkos::TeamPolicy<DeviceType> auto_policy(ceil(1. * _D.extent(0) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); 165 Kokkos::TeamPolicy<DeviceType> tuned_policy(ceil(1. * _D.extent(0) / _N_team), _team_size, _vector_length); 166 Kokkos::TeamPolicy<DeviceType> policy; 167 168 if (_team_size < 1) policy = auto_policy; 169 else policy = tuned_policy; 170 171 _handle.set_max_iteration(_N_iteration); 172 _handle.set_tolerance(_tol); 173 _handle.set_ortho_strategy(_ortho_strategy); 174 _handle.set_scratch_pad_level(_scratch_pad_level); 175 _handle.set_compute_last_residual(true); 176 177 int maximum_iteration = _handle.get_max_iteration(); 178 179 using ScalarType = typename ValuesViewType::non_const_value_type; 180 using Layout = typename ValuesViewType::array_layout; 181 using EXSP = typename ValuesViewType::execution_space; 182 183 using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type; 184 185 using ViewType1D = Kokkos::View<MagnitudeType *, Layout, EXSP>; 186 using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>; 187 using ViewType3D = Kokkos::View<ScalarType ***, Layout, EXSP>; 188 using IntViewType1D = Kokkos::View<PetscInt *, Layout, EXSP>; 189 190 size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); 191 size_t bytes_row_ptr = IntViewType1D::shmem_size(_r.extent(1)); 192 size_t bytes_col_idc = IntViewType1D::shmem_size(_c.extent(1)); 193 size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); 194 size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); 195 196 size_t bytes_diag = bytes_2D_1; 197 size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; 198 199 policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp)); 200 policy.set_scratch_size(1, Kokkos::PerTeam(bytes_col_idc + bytes_row_ptr + bytes_diag)); 201 PetscInfo(pc, "%d scratch memory(0) = %d + %d + %d bytes_diag=%d; %d scratch memory(1); %d maximum_iterations\n", (int)(bytes_tmp), 2 * (int)bytes_2D_1, 2 * (int)bytes_1D, (int)bytes_2D_2, (int)bytes_diag, (int)(bytes_row_ptr + bytes_col_idc + bytes_diag), (int)maximum_iteration); 202 exec_space().fence(); 203 timer.reset(); 204 Kokkos::parallel_for(name.c_str(), policy, *this); 205 exec_space().fence(); 206 double sec = timer.seconds(); 207 208 return sec; 209 } 210 }; 211 #endif // KK GMRES 212 213 typedef Kokkos::TeamPolicy<>::member_type team_member; 214 215 static PetscErrorCode PCBJKOKKOSCreateKSP_BJKOKKOS(PC pc) { 216 const char *prefix; 217 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 218 DM dm; 219 220 PetscFunctionBegin; 221 PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &jac->ksp)); 222 PetscCall(KSPSetErrorIfNotConverged(jac->ksp, pc->erroriffailure)); 223 PetscCall(PetscObjectIncrementTabLevel((PetscObject)jac->ksp, (PetscObject)pc, 1)); 224 PetscCall(PCGetOptionsPrefix(pc, &prefix)); 225 PetscCall(KSPSetOptionsPrefix(jac->ksp, prefix)); 226 PetscCall(KSPAppendOptionsPrefix(jac->ksp, "pc_bjkokkos_")); 227 PetscCall(PCGetDM(pc, &dm)); 228 if (dm) { 229 PetscCall(KSPSetDM(jac->ksp, dm)); 230 PetscCall(KSPSetDMActive(jac->ksp, PETSC_FALSE)); 231 } 232 jac->reason = PETSC_FALSE; 233 jac->monitor = PETSC_FALSE; 234 jac->batch_target = -1; 235 jac->nsolves_team = 1; 236 jac->ksp->max_it = 50; // this is realy for GMRES w/o restarts 237 PetscFunctionReturn(0); 238 } 239 240 // y <-- Ax 241 KOKKOS_INLINE_FUNCTION PetscErrorCode MatMult(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc) { 242 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 243 int rowa = ic[rowb]; 244 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 245 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 246 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 247 PetscScalar sum; 248 Kokkos::parallel_reduce( 249 Kokkos::ThreadVectorRange(team, n), [=](const int i, PetscScalar &lsum) { lsum += aa[i] * x_loc[r[aj[i]] - start]; }, sum); 250 Kokkos::single(Kokkos::PerThread(team), [=]() { y_loc[rowb - start] = sum; }); 251 }); 252 team.team_barrier(); 253 return 0; 254 } 255 256 // temp buffer per thread with reduction at end? 257 KOKKOS_INLINE_FUNCTION PetscErrorCode MatMultTranspose(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc) { 258 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, end - start), [=](int i) { y_loc[i] = 0; }); 259 team.team_barrier(); 260 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 261 int rowa = ic[rowb]; 262 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 263 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 264 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 265 const PetscScalar xx = x_loc[rowb - start]; // rowb = ic[rowa] = ic[r[rowb]] 266 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) { 267 PetscScalar val = aa[i] * xx; 268 Kokkos::atomic_fetch_add(&y_loc[r[aj[i]] - start], val); 269 }); 270 }); 271 team.team_barrier(); 272 return 0; 273 } 274 275 typedef struct Batch_MetaData_TAG { 276 PetscInt flops; 277 PetscInt its; 278 KSPConvergedReason reason; 279 } Batch_MetaData; 280 281 // Solve A(BB^-1)x = y with TFQMR. Right preconditioned to get un-preconditioned residual 282 KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_TFQMR(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor) { 283 using Kokkos::parallel_for; 284 using Kokkos::parallel_reduce; 285 int Nblk = end - start, i, m, stride = stride_shared, idx = 0; 286 PetscReal dp, dpold, w, dpest, tau, psi, cm, r0; 287 const PetscScalar *Diag = &glb_idiag[start]; 288 PetscScalar *ptr = work_space_shared, rho, rhoold, a, s, b, eta, etaold, psiold, cf, dpi; 289 290 if (idx++ == nShareVec) { 291 ptr = work_space_global; 292 stride = stride_global; 293 } 294 PetscScalar *XX = ptr; 295 ptr += stride; 296 if (idx++ == nShareVec) { 297 ptr = work_space_global; 298 stride = stride_global; 299 } 300 PetscScalar *R = ptr; 301 ptr += stride; 302 if (idx++ == nShareVec) { 303 ptr = work_space_global; 304 stride = stride_global; 305 } 306 PetscScalar *RP = ptr; 307 ptr += stride; 308 if (idx++ == nShareVec) { 309 ptr = work_space_global; 310 stride = stride_global; 311 } 312 PetscScalar *V = ptr; 313 ptr += stride; 314 if (idx++ == nShareVec) { 315 ptr = work_space_global; 316 stride = stride_global; 317 } 318 PetscScalar *T = ptr; 319 ptr += stride; 320 if (idx++ == nShareVec) { 321 ptr = work_space_global; 322 stride = stride_global; 323 } 324 PetscScalar *Q = ptr; 325 ptr += stride; 326 if (idx++ == nShareVec) { 327 ptr = work_space_global; 328 stride = stride_global; 329 } 330 PetscScalar *P = ptr; 331 ptr += stride; 332 if (idx++ == nShareVec) { 333 ptr = work_space_global; 334 stride = stride_global; 335 } 336 PetscScalar *U = ptr; 337 ptr += stride; 338 if (idx++ == nShareVec) { 339 ptr = work_space_global; 340 stride = stride_global; 341 } 342 PetscScalar *D = ptr; 343 ptr += stride; 344 if (idx++ == nShareVec) { 345 ptr = work_space_global; 346 stride = stride_global; 347 } 348 PetscScalar *AUQ = V; 349 350 // init: get b, zero x 351 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 352 int rowa = ic[rowb]; 353 R[rowb - start] = glb_b[rowa]; 354 XX[rowb - start] = 0; 355 }); 356 team.team_barrier(); 357 parallel_reduce( 358 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi); 359 team.team_barrier(); 360 r0 = dp = PetscSqrtReal(PetscRealPart(dpi)); 361 // diagnostics 362 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 363 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); }); 364 #endif 365 if (dp < atol) { 366 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 367 return 0; 368 } 369 if (0 == maxit) { 370 metad->reason = KSP_DIVERGED_ITS; 371 return 0; 372 } 373 374 /* Make the initial Rp = R */ 375 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { RP[idx] = R[idx]; }); 376 team.team_barrier(); 377 /* Set the initial conditions */ 378 etaold = 0.0; 379 psiold = 0.0; 380 tau = dp; 381 dpold = dp; 382 383 /* rhoold = (r,rp) */ 384 parallel_reduce( 385 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rhoold); 386 team.team_barrier(); 387 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 388 U[idx] = R[idx]; 389 P[idx] = R[idx]; 390 T[idx] = Diag[idx] * P[idx]; 391 D[idx] = 0; 392 }); 393 team.team_barrier(); 394 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V); 395 396 i = 0; 397 do { 398 /* s <- (v,rp) */ 399 parallel_reduce( 400 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += V[idx] * PetscConj(RP[idx]); }, s); 401 team.team_barrier(); 402 a = rhoold / s; /* a <- rho / s */ 403 /* q <- u - a v VecWAXPY(w,alpha,x,y): w = alpha x + y. */ 404 /* t <- u + q */ 405 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 406 Q[idx] = U[idx] - a * V[idx]; 407 T[idx] = U[idx] + Q[idx]; 408 }); 409 team.team_barrier(); 410 // KSP_PCApplyBAorAB 411 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * T[idx]; }); 412 team.team_barrier(); 413 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, AUQ); 414 /* r <- r - a K (u + q) */ 415 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { R[idx] = R[idx] - a * AUQ[idx]; }); 416 team.team_barrier(); 417 parallel_reduce( 418 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi); 419 team.team_barrier(); 420 dp = PetscSqrtReal(PetscRealPart(dpi)); 421 for (m = 0; m < 2; m++) { 422 if (!m) w = PetscSqrtReal(dp * dpold); 423 else w = dp; 424 psi = w / tau; 425 cm = 1.0 / PetscSqrtReal(1.0 + psi * psi); 426 tau = tau * psi * cm; 427 eta = cm * cm * a; 428 cf = psiold * psiold * etaold / a; 429 if (!m) { 430 /* D = U + cf D */ 431 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = U[idx] + cf * D[idx]; }); 432 } else { 433 /* D = Q + cf D */ 434 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = Q[idx] + cf * D[idx]; }); 435 } 436 team.team_barrier(); 437 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = XX[idx] + eta * D[idx]; }); 438 team.team_barrier(); 439 dpest = PetscSqrtReal(2 * i + m + 2.0) * tau; 440 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 441 if (monitor && m == 1) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dpest); }); 442 #endif 443 if (dpest < atol) { 444 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 445 goto done; 446 } 447 if (dpest / r0 < rtol) { 448 metad->reason = KSP_CONVERGED_RTOL_NORMAL; 449 goto done; 450 } 451 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 452 if (dpest / r0 > dtol) { 453 metad->reason = KSP_DIVERGED_DTOL; 454 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dpest, r0); }); 455 goto done; 456 } 457 #else 458 if (dpest / r0 > dtol) { 459 metad->reason = KSP_DIVERGED_DTOL; 460 goto done; 461 } 462 #endif 463 if (i + 1 == maxit) { 464 metad->reason = KSP_DIVERGED_ITS; 465 goto done; 466 } 467 468 etaold = eta; 469 psiold = psi; 470 } 471 472 /* rho <- (r,rp) */ 473 parallel_reduce( 474 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rho); 475 team.team_barrier(); 476 b = rho / rhoold; /* b <- rho / rhoold */ 477 /* u <- r + b q */ 478 /* p <- u + b(q + b p) */ 479 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 480 U[idx] = R[idx] + b * Q[idx]; 481 Q[idx] = Q[idx] + b * P[idx]; 482 P[idx] = U[idx] + b * Q[idx]; 483 }); 484 /* v <- K p */ 485 team.team_barrier(); 486 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * P[idx]; }); 487 team.team_barrier(); 488 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V); 489 490 rhoold = rho; 491 dpold = dp; 492 493 i++; 494 } while (i < maxit); 495 done: 496 // KSPUnwindPreconditioner 497 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = Diag[idx] * XX[idx]; }); 498 team.team_barrier(); 499 // put x into Plex order 500 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 501 int rowa = ic[rowb]; 502 glb_x[rowa] = XX[rowb - start]; 503 }); 504 metad->its = i + 1; 505 if (1) { 506 int nnz; 507 parallel_reduce( 508 Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz); 509 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk); 510 } else { 511 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess 512 } 513 return 0; 514 } 515 516 // Solve Ax = y with biCG 517 KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_BICG(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor) { 518 using Kokkos::parallel_for; 519 using Kokkos::parallel_reduce; 520 int Nblk = end - start, i, stride = stride_shared, idx = 0; // start in shared mem 521 PetscReal dp, r0; 522 const PetscScalar *Di = &glb_idiag[start]; 523 PetscScalar *ptr = work_space_shared, dpi, a = 1.0, beta, betaold = 1.0, b, b2, ma, mac; 524 525 if (idx++ == nShareVec) { 526 ptr = work_space_global; 527 stride = stride_global; 528 } 529 PetscScalar *XX = ptr; 530 ptr += stride; 531 if (idx++ == nShareVec) { 532 ptr = work_space_global; 533 stride = stride_global; 534 } 535 PetscScalar *Rl = ptr; 536 ptr += stride; 537 if (idx++ == nShareVec) { 538 ptr = work_space_global; 539 stride = stride_global; 540 } 541 PetscScalar *Zl = ptr; 542 ptr += stride; 543 if (idx++ == nShareVec) { 544 ptr = work_space_global; 545 stride = stride_global; 546 } 547 PetscScalar *Pl = ptr; 548 ptr += stride; 549 if (idx++ == nShareVec) { 550 ptr = work_space_global; 551 stride = stride_global; 552 } 553 PetscScalar *Rr = ptr; 554 ptr += stride; 555 if (idx++ == nShareVec) { 556 ptr = work_space_global; 557 stride = stride_global; 558 } 559 PetscScalar *Zr = ptr; 560 ptr += stride; 561 if (idx++ == nShareVec) { 562 ptr = work_space_global; 563 stride = stride_global; 564 } 565 PetscScalar *Pr = ptr; 566 ptr += stride; 567 568 /* r <- b (x is 0) */ 569 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 570 int rowa = ic[rowb]; 571 //PetscCall(VecCopy(Rr,Rl)); 572 Rl[rowb - start] = Rr[rowb - start] = glb_b[rowa]; 573 XX[rowb - start] = 0; 574 }); 575 team.team_barrier(); 576 /* z <- Br */ 577 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 578 Zr[idx] = Di[idx] * Rr[idx]; 579 Zl[idx] = Di[idx] * Rl[idx]; 580 }); 581 team.team_barrier(); 582 /* dp <- r'*r */ 583 parallel_reduce( 584 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi); 585 team.team_barrier(); 586 r0 = dp = PetscSqrtReal(PetscRealPart(dpi)); 587 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 588 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); }); 589 #endif 590 if (dp < atol) { 591 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 592 return 0; 593 } 594 if (0 == maxit) { 595 metad->reason = KSP_DIVERGED_ITS; 596 return 0; 597 } 598 i = 0; 599 do { 600 /* beta <- r'z */ 601 parallel_reduce( 602 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += Zr[idx] * PetscConj(Rl[idx]); }, beta); 603 team.team_barrier(); 604 #if PCBJKOKKOS_VERBOSE_LEVEL >= 6 605 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 606 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%7d beta = Z.R = %22.14e \n", i, (double)beta); }); 607 #endif 608 #endif 609 if (!i) { 610 if (beta == 0.0) { 611 metad->reason = KSP_DIVERGED_BREAKDOWN_BICG; 612 goto done; 613 } 614 /* p <- z */ 615 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 616 Pr[idx] = Zr[idx]; 617 Pl[idx] = Zl[idx]; 618 }); 619 } else { 620 b = beta / betaold; 621 /* p <- z + b* p */ 622 b2 = PetscConj(b); 623 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 624 Pr[idx] = b * Pr[idx] + Zr[idx]; 625 Pl[idx] = b2 * Pl[idx] + Zl[idx]; 626 }); 627 } 628 team.team_barrier(); 629 betaold = beta; 630 /* z <- Kp */ 631 MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pr, Zr); 632 MatMultTranspose(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pl, Zl); 633 /* dpi <- z'p */ 634 parallel_reduce( 635 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Zr[idx] * PetscConj(Pl[idx]); }, dpi); 636 team.team_barrier(); 637 // 638 a = beta / dpi; /* a = beta/p'z */ 639 ma = -a; 640 mac = PetscConj(ma); 641 /* x <- x + ap */ 642 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 643 XX[idx] = XX[idx] + a * Pr[idx]; 644 Rr[idx] = Rr[idx] + ma * Zr[idx]; 645 Rl[idx] = Rl[idx] + mac * Zl[idx]; 646 }); 647 team.team_barrier(); 648 team.team_barrier(); 649 /* dp <- r'*r */ 650 parallel_reduce( 651 Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi); 652 team.team_barrier(); 653 dp = PetscSqrtReal(PetscRealPart(dpi)); 654 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 655 if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dp); }); 656 #endif 657 if (dp < atol) { 658 metad->reason = KSP_CONVERGED_ATOL_NORMAL; 659 goto done; 660 } 661 if (dp / r0 < rtol) { 662 metad->reason = KSP_CONVERGED_RTOL_NORMAL; 663 goto done; 664 } 665 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 666 if (dp / r0 > dtol) { 667 metad->reason = KSP_DIVERGED_DTOL; 668 Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dp, r0); }); 669 goto done; 670 } 671 #else 672 if (dp / r0 > dtol) { 673 metad->reason = KSP_DIVERGED_DTOL; 674 goto done; 675 } 676 #endif 677 if (i + 1 == maxit) { 678 metad->reason = KSP_DIVERGED_ITS; 679 goto done; 680 } 681 /* z <- Br */ 682 parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { 683 Zr[idx] = Di[idx] * Rr[idx]; 684 Zl[idx] = Di[idx] * Rl[idx]; 685 }); 686 i++; 687 team.team_barrier(); 688 } while (i < maxit); 689 done: 690 // put x back into Plex order 691 parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 692 int rowa = ic[rowb]; 693 glb_x[rowa] = XX[rowb - start]; 694 }); 695 metad->its = i + 1; 696 if (1) { 697 int nnz; 698 parallel_reduce( 699 Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz); 700 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk); 701 } else { 702 metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess 703 } 704 return 0; 705 } 706 707 // KSP solver solve Ax = b; x is output, bin is input 708 static PetscErrorCode PCApply_BJKOKKOS(PC pc, Vec bin, Vec xout) { 709 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 710 Mat A = pc->pmat; 711 Mat_SeqAIJKokkos *aijkok; 712 713 PetscFunctionBegin; 714 PetscCheck(jac->vec_diag && A, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "Not setup???? %p %p", jac->vec_diag, A); 715 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 716 if (!aijkok) { 717 SETERRQ(PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "No aijkok"); 718 } else { 719 PetscInt maxit = jac->ksp->max_it; 720 const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1; 721 const PetscInt nwork = jac->nwork, nBlk = jac->nBlocks; 722 PetscScalar *glb_xdata = NULL; 723 PetscReal rtol = jac->ksp->rtol, atol = jac->ksp->abstol, dtol = jac->ksp->divtol; 724 const PetscScalar *glb_idiag = jac->d_idiag_k->data(), *glb_bdata = NULL; 725 const PetscInt *glb_Aai = aijkok->i_device_data(), *glb_Aaj = aijkok->j_device_data(), *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(); 726 const PetscScalar *glb_Aaa = aijkok->a_device_data(); 727 const PetscInt *d_isicol = jac->d_isicol_k->data(), *d_isrow = jac->d_isrow_k->data(); 728 PCFailedReason pcreason; 729 KSPIndex ksp_type_idx = jac->ksp_type_idx; 730 PetscMemType mtype; 731 PetscContainer container; 732 PetscInt batch_sz; 733 VecScatter plex_batch = NULL; // not used 734 Vec bvec; // a copy of b for scatter (just alias to bin now) 735 PetscBool monitor = jac->monitor; // captured 736 PetscInt view_bid = jac->batch_target; 737 MatInfo info; 738 jac->max_nits = 0; 739 if (view_bid < 0) view_bid = 0; 740 PetscCall(MatGetInfo(A, MAT_LOCAL, &info)); 741 // get field major is to map plex IO to/from block/field major 742 PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container)); 743 if (container) { 744 PetscCall(VecDuplicate(bin, &bvec)); 745 PetscCall(PetscContainerGetPointer(container, (void **)&plex_batch)); 746 PetscCall(VecScatterBegin(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD)); 747 PetscCall(VecScatterEnd(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD)); 748 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No plex_batch_is -- require NO field major ordering for now"); 749 } else { 750 bvec = bin; 751 } 752 // get x 753 PetscCall(VecGetArrayAndMemType(xout, &glb_xdata, &mtype)); 754 #if defined(PETSC_HAVE_CUDA) 755 PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for x %" PetscInt_FMT " != %" PetscInt_FMT, mtype, PETSC_MEMTYPE_DEVICE); 756 #endif 757 PetscCall(VecGetArrayReadAndMemType(bvec, &glb_bdata, &mtype)); 758 #if defined(PETSC_HAVE_CUDA) 759 PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for b"); 760 #endif 761 // get batch size 762 PetscCall(PetscObjectQuery((PetscObject)A, "batch size", (PetscObject *)&container)); 763 if (container) { 764 PetscInt *pNf = NULL; 765 PetscCall(PetscContainerGetPointer(container, (void **)&pNf)); 766 batch_sz = *pNf; 767 } else batch_sz = 1; 768 PetscCheck(nBlk % batch_sz == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "batch_sz = %" PetscInt_FMT ", nBlk = %" PetscInt_FMT, batch_sz, nBlk); 769 if (ksp_type_idx == BATCH_KSP_GMRES_IDX) { // KK solver - move PETSc data into Kokkos Views, setup solver, solve, move data out of Kokkos, process metadata (convergence tests, etc.) 770 #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES) 771 int Nsolves_team = jac->nsolves_team, fill_idx = 0; 772 int Nloc = jac->const_block_size; // same grids 773 const int Nsolves = nBlk; 774 const int nnz = (int)info.nz_used / Nsolves; // fix for variable grid size 775 if (Nsolves_team > batch_sz) Nsolves_team = batch_sz; // silently fix this 776 PetscCheck(jac->const_block_size, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Kokkos (GMRES) solver requires constant block size (but can be made to work with species ordering or N_team==1)"); 777 PetscCheck(Nsolves % Nsolves_team == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Nsolves.mod(Nsolves_team) != 0: Nsolves = %d, Nsolves_team = %d", Nsolves, Nsolves_team); 778 PetscCheck(((int)info.nz_used) % Nsolves == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "info.nz_used.mod(Nsolves) != 0: info.nz_used = %g, Nsolves = %d", info.nz_used, Nsolves); 779 #if defined(PETSC_HAVE_CUDA) 780 nvtxRangePushA("gmres-kk"); 781 #endif 782 Kokkos::View<PetscScalar **, layout, exec_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>> inv_diag((PetscScalar *)glb_idiag, Nsolves, Nloc); // in correct order 783 if (!jac->rowOffsets) { 784 jac->rowOffsets = new IntView("rowOffsets", Nsolves / Nsolves_team, Nloc + 1); // same grids 785 jac->colIndices = new IntView("colIndices", Nsolves / Nsolves_team, nnz); 786 jac->batch_b = new XYType("batch rhs", Nsolves, Nloc); 787 jac->batch_x = new XYType("batch sol", Nsolves, Nloc); 788 jac->batch_values = new AMatrixValueView("batch values", Nsolves, nnz); 789 fill_idx = 1; 790 PetscInfo(pc, "Setup indices Nloc=%d, nnz=%d\n", Nloc, nnz); 791 } 792 IntView &rowOffsets = *jac->rowOffsets; 793 IntView &colIndices = *jac->colIndices; 794 XYType &batch_b = *jac->batch_b; 795 XYType &batch_x = *jac->batch_x; 796 AMatrixValueView &batch_values = *jac->batch_values; 797 798 Kokkos::deep_copy(batch_x, 0.); 799 PetscInfo(pc, "\tjac->n = %" PetscInt_FMT ", Nloc = %d, Nsolves = %d, nnz = %d, Nsolves_team = %d, league size = %d, maxit = %" PetscInt_FMT "\n", jac->n, Nloc, Nsolves, nnz, Nsolves_team, Nsolves / Nsolves_team, maxit); 800 Kokkos::parallel_for( 801 "rowOffsets+map", Kokkos::TeamPolicy<>(Nsolves, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 802 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; 803 if (fill_idx) { 804 if (blkID % Nsolves_team == 0) { // first matrix on this member 805 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](const int rowb) { // Nloc 806 int rowa = d_isicol[rowb]; 807 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 808 rowOffsets(blkID / Nsolves_team, rowb + 1 - start) = n; // save sizes 809 }); 810 } 811 } 812 // map b into field major space 813 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 814 int rowa = d_isicol[rowb]; 815 batch_b(blkID, rowb - start) = glb_bdata[rowa]; 816 }); 817 }); 818 Kokkos::fence(); 819 if (fill_idx) { 820 Kokkos::parallel_for( 821 "prefix sum", Kokkos::TeamPolicy<>(Nsolves / Nsolves_team, 1, 1), KOKKOS_LAMBDA(const team_member team) { 822 const int graphID = team.league_rank(); 823 rowOffsets(graphID, 0) = 0; 824 for (size_t i = 0; i < Nloc; ++i) rowOffsets(graphID, i + 1) += rowOffsets(graphID, i); 825 }); 826 Kokkos::fence(); 827 } 828 Kokkos::parallel_for( 829 "copy matrix", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 830 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1], graphID = blkID / Nsolves_team; 831 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) { 832 int rowa = d_isicol[rowb]; // global index 833 int n = glb_Aai[rowa + 1] - glb_Aai[rowa]; 834 const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; 835 const PetscScalar *aa = glb_Aaa + glb_Aai[rowa]; 836 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) { 837 PetscScalar val = aa[i]; 838 if (fill_idx && blkID % Nsolves_team == 0) colIndices(graphID, rowOffsets(graphID, rowb - start) + i) = d_isrow[aj[i]] - blkID * Nloc; // local" global - block start 839 batch_values(blkID, rowOffsets(graphID, rowb - start) + i) = val; 840 }); 841 }); 842 }); 843 Kokkos::fence(); 844 // setup solver 845 using ScalarType = typename AMatrixValueView::non_const_value_type; 846 using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type; 847 using NormViewType = Kokkos::View<MagnitudeType *, layout, exec_space>; 848 using Norm2DViewType = Kokkos::View<MagnitudeType **, layout, exec_space>; 849 using Scalar3DViewType = Kokkos::View<ScalarType ***, layout, exec_space>; 850 using IntViewType = Kokkos::View<int *, layout, exec_space>; 851 using KrylovHandleType = KokkosBatched::KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>; 852 const int n_iterations = maxit; 853 const int team_size = -1; 854 const int vector_length = -1; 855 const double tol = rtol; 856 const int ortho_strategy = 0; 857 KrylovHandleType handle(Nsolves, Nsolves_team, n_iterations, true); 858 handle.Arnoldi_view = Scalar3DViewType("", Nsolves, n_iterations, Nloc + n_iterations + 3); 859 // solve 860 double time = Functor_TestBatchedTeamVectorGMRES<exec_space, AMatrixValueView, IntView, XYType, KrylovHandleType>(batch_values, inv_diag, rowOffsets, colIndices, batch_x, batch_b, Nsolves_team, team_size, vector_length, n_iterations, tol, ortho_strategy, 0, handle) 861 .run(pc); 862 Kokkos::fence(); 863 // get data back 864 Kokkos::parallel_for( 865 "map", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 866 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; // 0 867 // map x into Plex/PETSc 868 Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) { 869 int rowa = d_isicol[rowb]; 870 glb_xdata[rowa] = batch_x(blkID, rowb - start); 871 }); 872 }); 873 // output assume species major - clone from Kokkos solvers 874 #if PCBJKOKKOS_VERBOSE_LEVEL >= 3 875 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 876 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "Iterations\n")); 877 #else 878 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "max iterations per species (gmres) :")); 879 #endif 880 for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) { 881 for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) { 882 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 883 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2D:", s)); 884 for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3D ", handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx]))); 885 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 886 #else 887 int count = 0, ii; 888 for (int bid = 0; bid < batch_sz; bid++) { 889 if ((ii = handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx])) > count) count = ii; 890 } 891 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3d", count)); 892 #endif 893 } 894 head += batch_sz * jac->dm_Nf[dmIdx]; 895 } 896 #if PCBJKOKKOS_VERBOSE_LEVEL == 3 897 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 898 #endif 899 #endif 900 // return error code, get max it 901 PetscInt count = 0, mbid = 0; 902 if (handle.is_converged_host()) { 903 pcreason = PC_NOERROR; 904 if (!jac->max_nits) { 905 for (int blkID = 0; blkID < nBlk; blkID++) { 906 if (handle.get_iteration_host(blkID) > jac->max_nits) { 907 jac->max_nits = handle.get_iteration_host(blkID); 908 mbid = blkID; 909 } 910 } 911 } 912 } else { 913 PetscCall(PetscPrintf(PETSC_COMM_SELF, "There is at least one system that did not converge.")); 914 pcreason = PC_SUBPC_ERROR; 915 } 916 // output - assume species major order 917 for (int blkID = 0; blkID < nBlk; blkID++) { 918 if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason 919 if (jac->batch_target == blkID) { 920 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", handle.is_converged_host(blkID) ? "converged" : "diverged", handle.get_iteration_host(blkID), blkID % batch_sz, blkID / batch_sz)); 921 } else if (jac->batch_target == -1 && handle.get_iteration_host(blkID) > count) { 922 jac->max_nits = count = handle.get_iteration_host(blkID); 923 mbid = blkID; 924 } 925 if (!handle.is_converged_host(blkID)) PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR species %d, batch %d did not converge with %d iterations\n", (int)(blkID / batch_sz), (int)blkID % batch_sz, handle.get_iteration_host(blkID))); 926 } 927 } 928 if (jac->batch_target == -1 && jac->reason) { 929 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iteration, batch %" PetscInt_FMT ", specie %" PetscInt_FMT "\n", handle.is_converged_host(mbid) ? "converged" : "diverged", jac->max_nits, mbid % batch_sz, mbid / batch_sz)); 930 } 931 #if defined(PETSC_HAVE_CUDA) 932 nvtxRangePop(); 933 #endif 934 #else 935 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "batch GMRES not supported"); 936 #endif 937 } else { // Kokkos Krylov 938 using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 939 using vect2D_scr_t = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, scr_mem_t>; 940 Kokkos::View<Batch_MetaData *, Kokkos::DefaultExecutionSpace> d_metadata("solver meta data", nBlk); 941 int stride_shared, stride_global, global_buff_words; 942 d_bid_eqOffset = jac->d_bid_eqOffset_k->data(); 943 // solve each block independently 944 int scr_bytes_team_shared = 0, nShareVec = 0, nGlobBVec = 0; 945 if (jac->const_block_size) { // use shared memory for work vectors only if constant block size - todo: test efficiency loss 946 int maximum_shared_mem_size = 64000; 947 948 if (PetscDeviceConfiguredFor_Internal(PETSC_DEVICE_DEFAULT)) { 949 // FIXME: remove the configuredfor check once the PETSC_DEVICE_HOST MR is merged in 950 PetscDevice device; 951 952 PetscCall(PetscDeviceGetDefault_Internal(&device)); 953 PetscCall(PetscDeviceGetAttribute(device, PETSC_DEVICE_ATTR_SIZE_T_SHARED_MEM_PER_BLOCK, &maximum_shared_mem_size)); 954 } 955 stride_shared = jac->const_block_size; // captured 956 nShareVec = maximum_shared_mem_size / (jac->const_block_size * sizeof(PetscScalar)); // integer floor, number of vectors that fit in shared 957 if (nShareVec > nwork) nShareVec = nwork; 958 else nGlobBVec = nwork - nShareVec; 959 global_buff_words = jac->n * nGlobBVec; 960 scr_bytes_team_shared = jac->const_block_size * nShareVec * sizeof(PetscScalar); 961 //PetscCall(PetscPrintf(PETSC_COMM_WORLD,"maximum_shared_mem_size=%d scr_bytes_shared=%d nShareVec=%d, nGlobBVec=%d vec size=%d jac->const_block_size=%d\n",maximum_shared_mem_size,scr_bytes_team_shared,nShareVec,nGlobBVec,jac->const_block_size*sizeof(PetscScalar),jac->const_block_size)); 962 } else { 963 scr_bytes_team_shared = 0; 964 stride_shared = 0; 965 global_buff_words = jac->n * nwork; 966 nGlobBVec = nwork; // not needed == fix 967 } 968 stride_global = jac->n; // captured 969 #if defined(PETSC_HAVE_CUDA) 970 nvtxRangePushA("batch-kokkos-solve"); 971 #endif 972 Kokkos::View<PetscScalar *, Kokkos::DefaultExecutionSpace> d_work_vecs_k("workvectors", global_buff_words); // global work vectors 973 PetscInfo(pc, "\tn = %d. %d shared bytes/team, %d global mem bytes, rtol=%e, num blocks %d, team_size=%d, %d vector threads, %d shared vectors, %d global vectors\n", (int)jac->n, scr_bytes_team_shared, global_buff_words, rtol, (int)nBlk, (int)team_size, PCBJKOKKOS_VEC_SIZE, nShareVec, nGlobBVec); 974 PetscScalar *d_work_vecs = d_work_vecs_k.data(); 975 Kokkos::parallel_for( 976 "Solve", Kokkos::TeamPolicy<Kokkos::LaunchBounds<256, 4>>(nBlk, team_size, PCBJKOKKOS_VEC_SIZE).set_scratch_size(PCBJKOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes_team_shared)), KOKKOS_LAMBDA(const team_member team) { 977 const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; 978 vect2D_scr_t work_vecs_shared(team.team_scratch(PCBJKOKKOS_SHARED_LEVEL), end - start, nShareVec); 979 PetscScalar *work_buff_shared = work_vecs_shared.data(); 980 PetscScalar *work_buff_global = &d_work_vecs[start]; // start inc'ed in 981 bool print = monitor && (blkID == view_bid); 982 switch (ksp_type_idx) { 983 case BATCH_KSP_BICG_IDX: 984 BJSolve_BICG(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print); 985 break; 986 case BATCH_KSP_TFQMR_IDX: 987 BJSolve_TFQMR(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print); 988 break; 989 case BATCH_KSP_GMRES_IDX: 990 //BJSolve_GMRES(); 991 break; 992 default: 993 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 994 printf("Unknown KSP type %d\n", ksp_type_idx); 995 #else 996 /* void */; 997 #endif 998 } 999 }); 1000 Kokkos::fence(); 1001 #if defined(PETSC_HAVE_CUDA) 1002 nvtxRangePop(); 1003 nvtxRangePushA("Post-solve-metadata"); 1004 #endif 1005 auto h_metadata = Kokkos::create_mirror(Kokkos::HostSpace::memory_space(), d_metadata); 1006 Kokkos::deep_copy(h_metadata, d_metadata); 1007 #if PCBJKOKKOS_VERBOSE_LEVEL >= 3 1008 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 1009 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Iterations\n")); 1010 #endif 1011 // assume species major 1012 #if PCBJKOKKOS_VERBOSE_LEVEL < 4 1013 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "max iterations per species (%s) :", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr")); 1014 #endif 1015 for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) { 1016 for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) { 1017 #if PCBJKOKKOS_VERBOSE_LEVEL >= 4 1018 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2" PetscInt_FMT ":", s)); 1019 for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its)); 1020 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 1021 #else 1022 PetscInt count = 0; 1023 for (int bid = 0; bid < batch_sz; bid++) { 1024 if (h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its > count) count = h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its; 1025 } 1026 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", count)); 1027 #endif 1028 } 1029 head += batch_sz * jac->dm_Nf[dmIdx]; 1030 } 1031 #if PCBJKOKKOS_VERBOSE_LEVEL == 3 1032 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n")); 1033 #endif 1034 #endif 1035 PetscInt count = 0, mbid = 0; 1036 for (int blkID = 0; blkID < nBlk; blkID++) { 1037 PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops)); 1038 if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason 1039 if (jac->batch_target == blkID) { 1040 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID % batch_sz, blkID / batch_sz)); 1041 } else if (jac->batch_target == -1 && h_metadata[blkID].its > count) { 1042 jac->max_nits = count = h_metadata[blkID].its; 1043 mbid = blkID; 1044 } 1045 if (h_metadata[blkID].reason < 0) { 1046 PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz)); 1047 } 1048 } 1049 } 1050 if (jac->batch_target == -1 && jac->reason) { 1051 PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", specie %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[mbid].reason], h_metadata[mbid].its, mbid % batch_sz, mbid / batch_sz)); 1052 } 1053 { 1054 int errsum; 1055 Kokkos::parallel_reduce( 1056 nBlk, 1057 KOKKOS_LAMBDA(const int idx, int &lsum) { 1058 if (d_metadata[idx].reason < 0) ++lsum; 1059 }, 1060 errsum); 1061 pcreason = errsum ? PC_SUBPC_ERROR : PC_NOERROR; 1062 if (!errsum && !jac->max_nits) { // set max its to give back to top KSP 1063 for (int blkID = 0; blkID < nBlk; blkID++) { 1064 if (h_metadata[blkID].its > jac->max_nits) jac->max_nits = h_metadata[blkID].its; 1065 } 1066 } else if (errsum) { 1067 PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR Kokkos batch solver did not converge in all solves\n")); 1068 } 1069 } 1070 #if defined(PETSC_HAVE_CUDA) 1071 nvtxRangePop(); 1072 #endif 1073 } // end of Kokkos (not Kernels) solvers block 1074 PetscCall(VecRestoreArrayAndMemType(xout, &glb_xdata)); 1075 PetscCall(VecRestoreArrayReadAndMemType(bvec, &glb_bdata)); 1076 PetscCall(PCSetFailedReason(pc, pcreason)); 1077 // map back to Plex space - not used 1078 if (plex_batch) { 1079 PetscCall(VecCopy(xout, bvec)); 1080 PetscCall(VecScatterBegin(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE)); 1081 PetscCall(VecScatterEnd(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE)); 1082 PetscCall(VecDestroy(&bvec)); 1083 } 1084 } // whole 'have aijkok' block 1085 PetscFunctionReturn(0); 1086 } 1087 1088 static PetscErrorCode PCSetUp_BJKOKKOS(PC pc) { 1089 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1090 Mat A = pc->pmat; 1091 Mat_SeqAIJKokkos *aijkok; 1092 PetscBool flg; 1093 1094 PetscFunctionBegin; 1095 PetscCheck(!pc->useAmat, PetscObjectComm((PetscObject)pc), PETSC_ERR_SUP, "No support for using 'use_amat'"); 1096 PetscCheck(A, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No matrix - A is used above"); 1097 PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, MATAIJKOKKOS, "")); 1098 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "must use '-dm_mat_type aijkokkos -dm_vec_type kokkos' for -pc_type bjkokkos"); 1099 if (!(aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr))) { 1100 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No aijkok"); 1101 } else { 1102 if (!jac->vec_diag) { 1103 Vec *subX; 1104 DM pack, *subDM; 1105 PetscInt nDMs, n; 1106 PetscContainer container; 1107 PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container)); 1108 { // Permute the matrix to get a block diagonal system: d_isrow_k, d_isicol_k 1109 MatOrderingType rtype; 1110 IS isrow, isicol; 1111 const PetscInt *rowindices, *icolindices; 1112 rtype = MATORDERINGRCM; 1113 // get permutation. Not what I expect so inverted here 1114 PetscCall(MatGetOrdering(A, rtype, &isrow, &isicol)); 1115 PetscCall(ISDestroy(&isrow)); 1116 PetscCall(ISInvertPermutation(isicol, PETSC_DECIDE, &isrow)); // THIS IS BACKWARD -- isrow is inverse -- FIX!!!!! 1117 1118 Mat mat_block_order; 1119 PetscCall(MatCreateSubMatrix(A, isicol, isicol, MAT_INITIAL_MATRIX, &mat_block_order)); 1120 PetscCall(MatViewFromOptions(mat_block_order, NULL, "-ksp_batch_reorder_view")); 1121 PetscCall(MatDestroy(&mat_block_order)); 1122 1123 PetscCall(ISGetIndices(isrow, &rowindices)); 1124 PetscCall(ISGetIndices(isicol, &icolindices)); 1125 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isrow_k((PetscInt *)rowindices, A->rmap->n); 1126 const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isicol_k((PetscInt *)icolindices, A->rmap->n); 1127 jac->d_isrow_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isrow_k)); 1128 jac->d_isicol_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isicol_k)); 1129 Kokkos::deep_copy(*jac->d_isrow_k, h_isrow_k); 1130 Kokkos::deep_copy(*jac->d_isicol_k, h_isicol_k); 1131 PetscCall(ISRestoreIndices(isrow, &rowindices)); 1132 PetscCall(ISRestoreIndices(isicol, &icolindices)); 1133 PetscCall(ISDestroy(&isrow)); 1134 PetscCall(ISDestroy(&isicol)); 1135 } 1136 // get block sizes 1137 PetscCall(PCGetDM(pc, &pack)); 1138 PetscCheck(pack, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "no DM. Requires a composite DM"); 1139 PetscCall(PetscObjectTypeCompare((PetscObject)pack, DMCOMPOSITE, &flg)); 1140 PetscCheck(flg, PetscObjectComm((PetscObject)pack), PETSC_ERR_USER, "Not for type %s", ((PetscObject)pack)->type_name); 1141 PetscCall(DMCompositeGetNumberDM(pack, &nDMs)); 1142 jac->num_dms = nDMs; 1143 PetscCall(DMCreateGlobalVector(pack, &jac->vec_diag)); 1144 PetscCall(VecGetLocalSize(jac->vec_diag, &n)); 1145 jac->n = n; 1146 jac->d_idiag_k = new Kokkos::View<PetscScalar *, Kokkos::LayoutRight>("idiag", n); 1147 // options 1148 PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1149 PetscCall(KSPSetFromOptions(jac->ksp)); 1150 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPBICG, "")); 1151 if (flg) { 1152 jac->ksp_type_idx = BATCH_KSP_BICG_IDX; 1153 jac->nwork = 7; 1154 } else { 1155 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPTFQMR, "")); 1156 if (flg) { 1157 jac->ksp_type_idx = BATCH_KSP_TFQMR_IDX; 1158 jac->nwork = 10; 1159 } else { 1160 PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPGMRES, "")); 1161 if (flg) { 1162 jac->ksp_type_idx = BATCH_KSP_GMRES_IDX; 1163 jac->nwork = 0; 1164 } else { 1165 SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unsupported batch ksp type"); 1166 } 1167 } 1168 } 1169 PetscOptionsBegin(PetscObjectComm((PetscObject)jac->ksp), ((PetscObject)jac->ksp)->prefix, "Options for Kokkos batch solver", "none"); 1170 PetscCall(PetscOptionsBool("-ksp_converged_reason", "", "bjkokkos.kokkos.cxx.c", jac->reason, &jac->reason, NULL)); 1171 PetscCall(PetscOptionsBool("-ksp_monitor", "", "bjkokkos.kokkos.cxx.c", jac->monitor, &jac->monitor, NULL)); 1172 PetscCall(PetscOptionsInt("-ksp_batch_target", "", "bjkokkos.kokkos.cxx.c", jac->batch_target, &jac->batch_target, NULL)); 1173 PetscCall(PetscOptionsInt("-ksp_batch_nsolves_team", "", "bjkokkos.kokkos.cxx.c", jac->nsolves_team, &jac->nsolves_team, NULL)); 1174 PetscCheck(jac->batch_target < jac->num_dms, PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "-ksp_batch_target (%" PetscInt_FMT ") >= number of DMs (%" PetscInt_FMT ")", jac->batch_target, jac->num_dms); 1175 PetscOptionsEnd(); 1176 // get blocks - jac->d_bid_eqOffset_k 1177 PetscCall(PetscMalloc(sizeof(*subX) * nDMs, &subX)); 1178 PetscCall(PetscMalloc(sizeof(*subDM) * nDMs, &subDM)); 1179 PetscCall(PetscMalloc(sizeof(*jac->dm_Nf) * nDMs, &jac->dm_Nf)); 1180 PetscCall(PetscInfo(pc, "Have %" PetscInt_FMT " DMs, n=%" PetscInt_FMT " rtol=%g type = %s\n", nDMs, n, (double)jac->ksp->rtol, ((PetscObject)jac->ksp)->type_name)); 1181 PetscCall(DMCompositeGetEntriesArray(pack, subDM)); 1182 jac->nBlocks = 0; 1183 for (PetscInt ii = 0; ii < nDMs; ii++) { 1184 PetscSection section; 1185 PetscInt Nf; 1186 DM dm = subDM[ii]; 1187 PetscCall(DMGetLocalSection(dm, §ion)); 1188 PetscCall(PetscSectionGetNumFields(section, &Nf)); 1189 jac->nBlocks += Nf; 1190 #if PCBJKOKKOS_VERBOSE_LEVEL <= 2 1191 if (ii == 0) PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks)); 1192 #else 1193 PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks)); 1194 #endif 1195 jac->dm_Nf[ii] = Nf; 1196 } 1197 { // d_bid_eqOffset_k 1198 Kokkos::View<PetscInt *, Kokkos::LayoutRight, Kokkos::HostSpace> h_block_offsets("block_offsets", jac->nBlocks + 1); 1199 PetscCall(DMCompositeGetAccessArray(pack, jac->vec_diag, nDMs, NULL, subX)); 1200 h_block_offsets[0] = 0; 1201 jac->const_block_size = -1; 1202 for (PetscInt ii = 0, idx = 0; ii < nDMs; ii++) { 1203 PetscInt nloc, nblk; 1204 PetscCall(VecGetSize(subX[ii], &nloc)); 1205 nblk = nloc / jac->dm_Nf[ii]; 1206 PetscCheck(nloc % jac->dm_Nf[ii] == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "nloc%%jac->dm_Nf[ii] (%" PetscInt_FMT ") != 0 DMs", nloc % jac->dm_Nf[ii]); 1207 for (PetscInt jj = 0; jj < jac->dm_Nf[ii]; jj++, idx++) { 1208 h_block_offsets[idx + 1] = h_block_offsets[idx] + nblk; 1209 #if PCBJKOKKOS_VERBOSE_LEVEL <= 2 1210 if (idx == 0) PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks)); 1211 #else 1212 PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks)); 1213 #endif 1214 if (jac->const_block_size == -1) jac->const_block_size = nblk; 1215 else if (jac->const_block_size > 0 && jac->const_block_size != nblk) jac->const_block_size = 0; 1216 } 1217 } 1218 PetscCall(DMCompositeRestoreAccessArray(pack, jac->vec_diag, jac->nBlocks, NULL, subX)); 1219 PetscCall(PetscFree(subX)); 1220 PetscCall(PetscFree(subDM)); 1221 jac->d_bid_eqOffset_k = new Kokkos::View<PetscInt *, Kokkos::LayoutRight>(Kokkos::create_mirror(Kokkos::DefaultExecutionSpace::memory_space(), h_block_offsets)); 1222 Kokkos::deep_copy(*jac->d_bid_eqOffset_k, h_block_offsets); 1223 } 1224 } 1225 { // get jac->d_idiag_k (PC setup), 1226 const PetscInt *d_ai = aijkok->i_device_data(), *d_aj = aijkok->j_device_data(); 1227 const PetscScalar *d_aa = aijkok->a_device_data(); 1228 const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1; 1229 PetscInt *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(), *r = jac->d_isrow_k->data(), *ic = jac->d_isicol_k->data(); 1230 PetscScalar *d_idiag = jac->d_idiag_k->data(); 1231 Kokkos::parallel_for( 1232 "Diag", Kokkos::TeamPolicy<>(jac->nBlocks, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) { 1233 const PetscInt blkID = team.league_rank(); 1234 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, d_bid_eqOffset[blkID], d_bid_eqOffset[blkID + 1]), [=](const int rowb) { 1235 const PetscInt rowa = ic[rowb], ai = d_ai[rowa], *aj = d_aj + ai; // grab original data 1236 const PetscScalar *aa = d_aa + ai; 1237 const PetscInt nrow = d_ai[rowa + 1] - ai; 1238 int found; 1239 Kokkos::parallel_reduce( 1240 Kokkos::ThreadVectorRange(team, nrow), 1241 [=](const int &j, int &count) { 1242 const PetscInt colb = r[aj[j]]; 1243 if (colb == rowb) { 1244 d_idiag[rowb] = 1. / aa[j]; 1245 count++; 1246 } 1247 }, 1248 found); 1249 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 1250 if (found != 1) Kokkos::single(Kokkos::PerThread(team), [=]() { printf("ERRORrow %d) found = %d\n", rowb, found); }); 1251 #endif 1252 }); 1253 }); 1254 } 1255 } 1256 PetscFunctionReturn(0); 1257 } 1258 1259 /* Default destroy, if it has never been setup */ 1260 static PetscErrorCode PCReset_BJKOKKOS(PC pc) { 1261 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1262 1263 PetscFunctionBegin; 1264 PetscCall(KSPDestroy(&jac->ksp)); 1265 PetscCall(VecDestroy(&jac->vec_diag)); 1266 if (jac->d_bid_eqOffset_k) delete jac->d_bid_eqOffset_k; 1267 if (jac->d_idiag_k) delete jac->d_idiag_k; 1268 if (jac->d_isrow_k) delete jac->d_isrow_k; 1269 if (jac->d_isicol_k) delete jac->d_isicol_k; 1270 jac->d_bid_eqOffset_k = NULL; 1271 jac->d_idiag_k = NULL; 1272 jac->d_isrow_k = NULL; 1273 jac->d_isicol_k = NULL; 1274 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", NULL)); // not published now (causes configure errors) 1275 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", NULL)); 1276 PetscCall(PetscFree(jac->dm_Nf)); 1277 jac->dm_Nf = NULL; 1278 if (jac->rowOffsets) delete jac->rowOffsets; 1279 if (jac->colIndices) delete jac->colIndices; 1280 if (jac->batch_b) delete jac->batch_b; 1281 if (jac->batch_x) delete jac->batch_x; 1282 if (jac->batch_values) delete jac->batch_values; 1283 jac->rowOffsets = NULL; 1284 jac->colIndices = NULL; 1285 jac->batch_b = NULL; 1286 jac->batch_x = NULL; 1287 jac->batch_values = NULL; 1288 1289 PetscFunctionReturn(0); 1290 } 1291 1292 static PetscErrorCode PCDestroy_BJKOKKOS(PC pc) { 1293 PetscFunctionBegin; 1294 PetscCall(PCReset_BJKOKKOS(pc)); 1295 PetscCall(PetscFree(pc->data)); 1296 PetscFunctionReturn(0); 1297 } 1298 1299 static PetscErrorCode PCView_BJKOKKOS(PC pc, PetscViewer viewer) { 1300 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1301 PetscBool iascii; 1302 1303 PetscFunctionBegin; 1304 if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1305 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 1306 if (iascii) { 1307 PetscCall(PetscViewerASCIIPrintf(viewer, " Batched device linear solver: Krylov (KSP) method with Jacobi preconditioning\n")); 1308 PetscCall(PetscViewerASCIIPrintf(viewer, "\t\tnwork = %" PetscInt_FMT ", rel tol = %e, abs tol = %e, div tol = %e, max it =%" PetscInt_FMT ", type = %s\n", jac->nwork, jac->ksp->rtol, jac->ksp->abstol, jac->ksp->divtol, jac->ksp->max_it, 1309 ((PetscObject)jac->ksp)->type_name)); 1310 } 1311 PetscFunctionReturn(0); 1312 } 1313 1314 static PetscErrorCode PCSetFromOptions_BJKOKKOS(PC pc, PetscOptionItems *PetscOptionsObject) { 1315 PetscFunctionBegin; 1316 PetscOptionsHeadBegin(PetscOptionsObject, "PC BJKOKKOS options"); 1317 PetscOptionsHeadEnd(); 1318 PetscFunctionReturn(0); 1319 } 1320 1321 static PetscErrorCode PCBJKOKKOSSetKSP_BJKOKKOS(PC pc, KSP ksp) { 1322 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1323 1324 PetscFunctionBegin; 1325 PetscCall(PetscObjectReference((PetscObject)ksp)); 1326 PetscCall(KSPDestroy(&jac->ksp)); 1327 jac->ksp = ksp; 1328 PetscFunctionReturn(0); 1329 } 1330 1331 /*@C 1332 PCBJKOKKOSSetKSP - Sets the KSP context for a KSP PC. 1333 1334 Collective on PC 1335 1336 Input Parameters: 1337 + pc - the preconditioner context 1338 - ksp - the KSP solver 1339 1340 Notes: 1341 The PC and the KSP must have the same communicator 1342 1343 Level: advanced 1344 1345 @*/ 1346 PetscErrorCode PCBJKOKKOSSetKSP(PC pc, KSP ksp) { 1347 PetscFunctionBegin; 1348 PetscValidHeaderSpecific(pc, PC_CLASSID, 1); 1349 PetscValidHeaderSpecific(ksp, KSP_CLASSID, 2); 1350 PetscCheckSameComm(pc, 1, ksp, 2); 1351 PetscTryMethod(pc, "PCBJKOKKOSSetKSP_C", (PC, KSP), (pc, ksp)); 1352 PetscFunctionReturn(0); 1353 } 1354 1355 static PetscErrorCode PCBJKOKKOSGetKSP_BJKOKKOS(PC pc, KSP *ksp) { 1356 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1357 1358 PetscFunctionBegin; 1359 if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc)); 1360 *ksp = jac->ksp; 1361 PetscFunctionReturn(0); 1362 } 1363 1364 /*@C 1365 PCBJKOKKOSGetKSP - Gets the KSP context for a KSP PC. 1366 1367 Not Collective but KSP returned is parallel if PC was parallel 1368 1369 Input Parameter: 1370 . pc - the preconditioner context 1371 1372 Output Parameters: 1373 . ksp - the KSP solver 1374 1375 Notes: 1376 You must call KSPSetUp() before calling PCBJKOKKOSGetKSP(). 1377 1378 If the PC is not a PCBJKOKKOS object it raises an error 1379 1380 Level: advanced 1381 1382 @*/ 1383 PetscErrorCode PCBJKOKKOSGetKSP(PC pc, KSP *ksp) { 1384 PetscFunctionBegin; 1385 PetscValidHeaderSpecific(pc, PC_CLASSID, 1); 1386 PetscValidPointer(ksp, 2); 1387 PetscUseMethod(pc, "PCBJKOKKOSGetKSP_C", (PC, KSP *), (pc, ksp)); 1388 PetscFunctionReturn(0); 1389 } 1390 1391 static PetscErrorCode PCPostSolve_BJKOKKOS(PC pc, KSP ksp, Vec b, Vec x) { 1392 PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data; 1393 1394 PetscFunctionBegin; 1395 ksp->its = jac->max_nits; 1396 PetscFunctionReturn(0); 1397 } 1398 1399 /* ----------------------------------------------------------------------------------*/ 1400 1401 /*MC 1402 PCBJKOKKOS - Defines a preconditioner that applies a Krylov solver and preconditioner to the blocks in a AIJASeq matrix on the GPU. 1403 1404 Options Database Key: 1405 . -pc_bjkokkos_ - options prefix with ksp options 1406 1407 Level: intermediate 1408 1409 Notes: 1410 For use with -ksp_type preonly to bypass any CPU work 1411 1412 Developer Notes: 1413 1414 .seealso: `PCCreate()`, `PCSetType()`, `PCType`, `PC`, 1415 `PCSHELL`, `PCCOMPOSITE`, `PCSetUseAmat()`, `PCBJKOKKOSGetKSP()` 1416 1417 M*/ 1418 1419 PETSC_EXTERN PetscErrorCode PCCreate_BJKOKKOS(PC pc) { 1420 PC_PCBJKOKKOS *jac; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscNewLog(pc, &jac)); 1424 pc->data = (void *)jac; 1425 1426 jac->ksp = NULL; 1427 jac->vec_diag = NULL; 1428 jac->d_bid_eqOffset_k = NULL; 1429 jac->d_idiag_k = NULL; 1430 jac->d_isrow_k = NULL; 1431 jac->d_isicol_k = NULL; 1432 jac->nBlocks = 1; 1433 jac->max_nits = 0; 1434 1435 PetscCall(PetscMemzero(pc->ops, sizeof(struct _PCOps))); 1436 pc->ops->apply = PCApply_BJKOKKOS; 1437 pc->ops->applytranspose = NULL; 1438 pc->ops->setup = PCSetUp_BJKOKKOS; 1439 pc->ops->reset = PCReset_BJKOKKOS; 1440 pc->ops->destroy = PCDestroy_BJKOKKOS; 1441 pc->ops->setfromoptions = PCSetFromOptions_BJKOKKOS; 1442 pc->ops->view = PCView_BJKOKKOS; 1443 pc->ops->postsolve = PCPostSolve_BJKOKKOS; 1444 1445 jac->rowOffsets = NULL; 1446 jac->colIndices = NULL; 1447 jac->batch_b = NULL; 1448 jac->batch_x = NULL; 1449 jac->batch_values = NULL; 1450 1451 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", PCBJKOKKOSGetKSP_BJKOKKOS)); 1452 PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", PCBJKOKKOSSetKSP_BJKOKKOS)); 1453 PetscFunctionReturn(0); 1454 } 1455