xref: /petsc/src/mat/impls/dense/seq/cupm/matseqdensecupm.hpp (revision 4742e46b56cb5d0762110e30c569ce3737a8e22a)
1 #ifndef PETSCMATSEQDENSECUPM_HPP
2 #define PETSCMATSEQDENSECUPM_HPP
3 
4 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
5 #include <../src/mat/impls/dense/seq/dense.h>
6 
7 #if defined(__cplusplus)
8   #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
9   #include <petsc/private/randomimpl.h> // _p_PetscRandom
10   #include <petsc/private/vecimpl.h>    // _p_Vec
11   #include <petsc/private/cupmobject.hpp>
12   #include <petsc/private/cupmsolverinterface.hpp>
13 
14   #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
15   #include <petsc/private/cpp/utility.hpp>     // util::exchange()
16 
17   #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM
18 
19 namespace Petsc
20 {
21 
22 namespace mat
23 {
24 
25 namespace cupm
26 {
27 
28 namespace impl
29 {
30 
31 template <device::cupm::DeviceType T>
32 class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
33 public:
34   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
35 
36 private:
37   struct Mat_SeqDenseCUPM {
38     PetscScalar *d_v;           // pointer to the matrix on the GPU
39     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
40     bool         d_user_alloc;
41     bool         d_unplaced_user_alloc;
42     // factorization support
43     cupmBlasInt_t *d_fact_ipiv;  // device pivots
44     cupmScalar_t  *d_fact_tau;   // device QR tau vector
45     cupmBlasInt_t *d_fact_info;  // device info
46     cupmScalar_t  *d_fact_work;  // device workspace
47     cupmBlasInt_t  d_fact_lwork; // size of device workspace
48     // workspace
49     Vec workvec;
50   };
51 
52   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
53 
54   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
55   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
56 
57   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
58 
59   template <typename Derived>
60   struct SolveCommon;
61   struct SolveQR;
62   struct SolveCholesky;
63   struct SolveLU;
64 
65   template <typename Solver, bool transpose>
66   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
67   template <typename Solver, bool transpose>
68   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
69   template <bool transpose>
70   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
71 
72   template <bool to_host>
73   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
74 
75   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
76   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
77 
78 public:
79   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
80 
81   // define these by hand since they don't fit the above mold
82   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
83   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
84 
85   static PetscErrorCode Create(Mat) noexcept;
86   static PetscErrorCode Destroy(Mat) noexcept;
87   static PetscErrorCode SetUp(Mat) noexcept;
88   static PetscErrorCode Reset(Mat) noexcept;
89 
90   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
91   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
92   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
93 
94   template <PetscMemType, PetscMemoryAccessMode>
95   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
96   template <PetscMemType, PetscMemoryAccessMode>
97   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98   template <PetscMemoryAccessMode>
99   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
100   template <PetscMemoryAccessMode>
101   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
102 
103 private:
104   template <PetscMemType mtype, PetscMemoryAccessMode mode>
105   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
106   {
107     PetscDeviceContext dctx;
108 
109     PetscFunctionBegin;
110     PetscCall(GetHandles_(&dctx));
111     PetscCall(GetArray<mtype, mode>(m, p, dctx));
112     PetscFunctionReturn(PETSC_SUCCESS);
113   }
114 
115   template <PetscMemType mtype, PetscMemoryAccessMode mode>
116   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
117   {
118     PetscDeviceContext dctx;
119 
120     PetscFunctionBegin;
121     PetscCall(GetHandles_(&dctx));
122     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
123     PetscFunctionReturn(PETSC_SUCCESS);
124   }
125 
126   template <PetscMemoryAccessMode mode>
127   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
128   {
129     PetscDeviceContext dctx;
130 
131     PetscFunctionBegin;
132     PetscCall(GetHandles_(&dctx));
133     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
134     PetscFunctionReturn(PETSC_SUCCESS);
135   }
136 
137   template <PetscMemoryAccessMode mode>
138   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
139   {
140     PetscDeviceContext dctx;
141 
142     PetscFunctionBegin;
143     PetscCall(GetHandles_(&dctx));
144     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
145     PetscFunctionReturn(PETSC_SUCCESS);
146   }
147 
148 public:
149   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
150   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
151   static PetscErrorCode ResetArray(Mat) noexcept;
152 
153   template <bool transpose_A, bool transpose_B>
154   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
155   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
156   static PetscErrorCode ZeroEntries(Mat) noexcept;
157   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
158   static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
159   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
160   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
161   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
162 
163   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
164   template <PetscMemoryAccessMode>
165   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
166   template <PetscMemoryAccessMode>
167   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
168 
169   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
170   static PetscErrorCode InvertFactors(Mat) noexcept;
171 
172   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
173   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
174 };
175 
176 } // namespace impl
177 
178 namespace
179 {
180 
181 // Declare this here so that the functions below can make use of it
182 template <device::cupm::DeviceType T>
183 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
184 {
185   PetscFunctionBegin;
186   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
187   PetscFunctionReturn(PETSC_SUCCESS);
188 }
189 
190 } // anonymous namespace
191 
192 namespace impl
193 {
194 
195 // ==========================================================================================
196 // MatDense_Seq_CUPM - Private API - Utility
197 // ==========================================================================================
198 
199 template <device::cupm::DeviceType T>
200 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
201 {
202   const auto   mcu   = MatCUPMCast(m);
203   const auto   nrows = m->rmap->n;
204   const auto   ncols = m->cmap->n;
205   auto        &lda   = MatIMPLCast(m)->lda;
206   cupmStream_t stream;
207 
208   PetscFunctionBegin;
209   PetscCheckTypeName(m, MATSEQDENSECUPM());
210   PetscValidDeviceContext(dctx, 2);
211   PetscCall(checkCupmBlasIntCast(nrows));
212   PetscCall(checkCupmBlasIntCast(ncols));
213   PetscCall(GetHandlesFrom_(dctx, &stream));
214   if (lda <= 0) lda = nrows;
215   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
216   if (user_device_array) {
217     mcu->d_user_alloc = PETSC_TRUE;
218     mcu->d_v          = user_device_array;
219   } else {
220     PetscInt size;
221 
222     mcu->d_user_alloc = PETSC_FALSE;
223     PetscCall(PetscIntMultError(lda, ncols, &size));
224     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
225     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
226   }
227   m->offloadmask = PETSC_OFFLOAD_GPU;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 template <device::cupm::DeviceType T>
232 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
233 {
234   const auto nrows = m->rmap->n;
235   const auto ncols = m->cmap->n;
236   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
237 
238   PetscFunctionBegin;
239   PetscCheckTypeName(m, MATSEQDENSECUPM());
240   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
241   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
242   if (copy) {
243     const auto   mcu = MatCUPMCast(m);
244     cupmStream_t stream;
245 
246     // Allocate GPU memory if not present
247     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx));
248     PetscCall(GetHandlesFrom_(dctx, &stream));
249     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
250     {
251       const auto mimpl = MatIMPLCast(m);
252       const auto lda   = mimpl->lda;
253       const auto src   = mimpl->v;
254       const auto dest  = mcu->d_v;
255 
256       if (lda > nrows) {
257         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
258       } else {
259         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
260       }
261     }
262     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
263     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
264     m->offloadmask = PETSC_OFFLOAD_BOTH;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 template <device::cupm::DeviceType T>
270 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
271 {
272   const auto nrows = m->rmap->n;
273   const auto ncols = m->cmap->n;
274   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;
275 
276   PetscFunctionBegin;
277   PetscCheckTypeName(m, MATSEQDENSECUPM());
278   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
279   if (copy) {
280     const auto   mimpl = MatIMPLCast(m);
281     cupmStream_t stream;
282 
283     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
284     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
285     PetscCall(GetHandlesFrom_(dctx, &stream));
286     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
287     {
288       const auto lda  = mimpl->lda;
289       const auto dest = mimpl->v;
290       const auto src  = MatCUPMCast(m)->d_v;
291 
292       if (lda > nrows) {
293         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
294       } else {
295         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
296       }
297     }
298     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
299     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
300     m->offloadmask = PETSC_OFFLOAD_BOTH;
301   }
302   PetscFunctionReturn(PETSC_SUCCESS);
303 }
304 
305 template <device::cupm::DeviceType T>
306 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
307 {
308   PetscFunctionBegin;
309   if (PetscDefined(USE_DEBUG)) {
310     cupmBlasInt_t info = 0;
311 
312     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
313     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
314     static_assert(std::is_same<decltype(info), int>::value, "");
315     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
316     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
317   }
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 // ==========================================================================================
322 // MatDense_Seq_CUPM - Private API - Solver Dispatch
323 // ==========================================================================================
324 
325 // specific solvers called through the dispatch_() family of functions
326 template <device::cupm::DeviceType T>
327 template <typename Derived>
328 struct MatDense_Seq_CUPM<T>::SolveCommon {
329   using derived_type = Derived;
330 
331   template <typename F>
332   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
333   {
334     cupmBlasInt_t lwork;
335 
336     PetscFunctionBegin;
337     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
338     if (lwork > mcu->d_fact_lwork) {
339       mcu->d_fact_lwork = lwork;
340       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
341       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
342     }
343     PetscFunctionReturn(PETSC_SUCCESS);
344   }
345 
346   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
347   {
348     const auto mcu = MatCUPMCast(A);
349 
350     PetscFunctionBegin;
351     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
352     A->factortype             = derived_type::MATFACTORTYPE();
353     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
354     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
355     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
356     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
357 
358     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
359     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
360     PetscFunctionReturn(PETSC_SUCCESS);
361   }
362 };
363 
364 template <device::cupm::DeviceType T>
365 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
366   using base_type = SolveCommon<SolveLU>;
367 
368   static constexpr const char   *NAME() noexcept { return "LU"; }
369   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
370 
371   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
372   {
373     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
374     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
375     cupmStream_t       stream;
376     cupmSolverHandle_t handle;
377     PetscDeviceContext dctx;
378 
379     PetscFunctionBegin;
380     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
381     PetscCall(GetHandles_(&dctx, &handle, &stream));
382     PetscCall(base_type::FactorPrepare(A, stream));
383     {
384       const auto mcu = MatCUPMCast(A);
385       const auto da  = DeviceArrayReadWrite(dctx, A);
386       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
387 
388       // clang-format off
389       PetscCall(
390         base_type::ResizeFactLwork(
391           mcu, stream,
392           [&](cupmBlasInt_t *fact_lwork)
393           {
394             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
395           }
396         )
397       );
398       // clang-format on
399       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
400 
401       PetscCall(PetscLogGpuTimeBegin());
402       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
403       PetscCall(PetscLogGpuTimeEnd());
404       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
405     }
406     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
407     PetscFunctionReturn(PETSC_SUCCESS);
408   }
409 
410   template <bool transpose>
411   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
412   {
413     const auto         mcu       = MatCUPMCast(A);
414     const auto         fact_info = mcu->d_fact_info;
415     const auto         fact_ipiv = mcu->d_fact_ipiv;
416     const auto         lda       = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
417     cupmSolverHandle_t handle;
418 
419     PetscFunctionBegin;
420     PetscCall(GetHandlesFrom_(dctx, &handle));
421     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
422     PetscCall(PetscLogGpuTimeBegin());
423     {
424       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
425       const auto     da  = DeviceArrayRead(dctx, A);
426       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
427 
428       // clang-format off
429       PetscCall(
430         base_type::ResizeFactLwork(
431           mcu, stream,
432           [&](cupmBlasInt_t *lwork)
433           {
434             return cupmSolverXgetrs_bufferSize(
435               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
436             );
437           }
438         )
439       );
440       // clang-format on
441       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
442       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
443     }
444     PetscCall(PetscLogGpuTimeEnd());
445     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
446     PetscFunctionReturn(PETSC_SUCCESS);
447   }
448 };
449 
450 template <device::cupm::DeviceType T>
451 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
452   using base_type = SolveCommon<SolveCholesky>;
453 
454   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
455   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
456 
457   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
458   {
459     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
460     PetscDeviceContext dctx;
461     cupmSolverHandle_t handle;
462     cupmStream_t       stream;
463 
464     PetscFunctionBegin;
465     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
466     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
467     PetscCall(GetHandles_(&dctx, &handle, &stream));
468     PetscCall(base_type::FactorPrepare(A, stream));
469     {
470       const auto mcu = MatCUPMCast(A);
471       const auto da  = DeviceArrayReadWrite(dctx, A);
472       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
473 
474       // clang-format off
475       PetscCall(
476         base_type::ResizeFactLwork(
477           mcu, stream,
478           [&](cupmBlasInt_t *fact_lwork)
479           {
480             return cupmSolverXpotrf_bufferSize(
481               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
482             );
483           }
484         )
485       );
486       // clang-format on
487       PetscCall(PetscLogGpuTimeBegin());
488       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
489       PetscCall(PetscLogGpuTimeEnd());
490       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
491     }
492     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
493 
494   #if 0
495     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
496     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
497     // routines will be available
498     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
499     if (!mcu->d_fact_lwork) {
500       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
501       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
502     }
503     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
504     PetscCall(PetscLogGpuTimeBegin());
505     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
506     PetscCall(PetscLogGpuTimeEnd());
507   #endif
508     PetscFunctionReturn(PETSC_SUCCESS);
509   }
510 
511   template <bool transpose>
512   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
513   {
514     const auto         mcu       = MatCUPMCast(A);
515     const auto         fact_info = mcu->d_fact_info;
516     cupmSolverHandle_t handle;
517 
518     PetscFunctionBegin;
519     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
520     PetscCall(GetHandlesFrom_(dctx, &handle));
521     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
522     PetscCall(PetscLogGpuTimeBegin());
523     {
524       const auto da  = DeviceArrayRead(dctx, A);
525       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
526 
527       // clang-format off
528       PetscCall(
529         base_type::ResizeFactLwork(
530           mcu, stream,
531           [&](cupmBlasInt_t *lwork)
532           {
533             return cupmSolverXpotrs_bufferSize(
534               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
535             );
536           }
537         )
538       );
539       // clang-format on
540       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
541       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
542     }
543     PetscCall(PetscLogGpuTimeEnd());
544     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
545     PetscFunctionReturn(PETSC_SUCCESS);
546   }
547 };
548 
549 template <device::cupm::DeviceType T>
550 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
551   using base_type = SolveCommon<SolveQR>;
552 
553   static constexpr const char   *NAME() noexcept { return "QR"; }
554   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
555 
556   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
557   {
558     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
559     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
560     const auto         min   = std::min(m, n);
561     const auto         mimpl = MatIMPLCast(A);
562     cupmStream_t       stream;
563     cupmSolverHandle_t handle;
564     PetscDeviceContext dctx;
565 
566     PetscFunctionBegin;
567     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
568     PetscCall(GetHandles_(&dctx, &handle, &stream));
569     PetscCall(base_type::FactorPrepare(A, stream));
570     mimpl->rank = min;
571     {
572       const auto mcu = MatCUPMCast(A);
573       const auto da  = DeviceArrayReadWrite(dctx, A);
574       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
575 
576       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
577       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
578       // clang-format off
579       PetscCall(
580         base_type::ResizeFactLwork(
581           mcu, stream,
582           [&](cupmBlasInt_t *fact_lwork)
583           {
584             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
585           }
586         )
587       );
588       // clang-format on
589       PetscCall(PetscLogGpuTimeBegin());
590       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
591       PetscCall(PetscLogGpuTimeEnd());
592       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
593     }
594     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
595     PetscFunctionReturn(PETSC_SUCCESS);
596   }
597 
598   template <bool transpose>
599   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
600   {
601     const auto         mimpl      = MatIMPLCast(A);
602     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
603     const auto         mcu        = MatCUPMCast(A);
604     const auto         fact_info  = mcu->d_fact_info;
605     const auto         fact_tau   = mcu->d_fact_tau;
606     const auto         fact_work  = mcu->d_fact_work;
607     const auto         fact_lwork = mcu->d_fact_lwork;
608     cupmSolverHandle_t solver_handle;
609     cupmBlasHandle_t   blas_handle;
610 
611     PetscFunctionBegin;
612     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
613     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
614     PetscCall(PetscLogGpuTimeBegin());
615     {
616       const auto da  = DeviceArrayRead(dctx, A);
617       const auto one = cupmScalarCast(1.0);
618       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
619 
620       if (transpose) {
621         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
622         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
623         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
624       } else {
625         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
626 
627         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
628         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
629         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
630       }
631     }
632     PetscCall(PetscLogGpuTimeEnd());
633     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
634     PetscFunctionReturn(PETSC_SUCCESS);
635   }
636 };
637 
638 template <device::cupm::DeviceType T>
639 template <typename Solver, bool transpose>
640 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
641 {
642   using namespace vec::cupm;
643   const auto         pobj_A  = PetscObjectCast(A);
644   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
645   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
646   auto              &workvec = MatCUPMCast(A)->workvec;
647   PetscScalar       *y_array = nullptr;
648   PetscDeviceContext dctx;
649   PetscBool          xiscupm, yiscupm, aiscupm;
650   bool               use_y_array_directly;
651   cupmStream_t       stream;
652 
653   PetscFunctionBegin;
654   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
655   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
656   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
657   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
658   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
659   PetscCall(GetHandles_(&dctx, &stream));
660   use_y_array_directly = yiscupm && (k >= m);
661   {
662     const PetscScalar *x_array;
663     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
664     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
665 
666     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
667     // The logic here is to try to minimize the amount of memory copying:
668     //
669     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
670     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
671     // data in order to copy it into the y array. So the array x will be wherever the data
672     // already is so that only one memcpy is performed
673     if (xisdevice) {
674       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
675     } else {
676       PetscCall(VecGetArrayRead(x, &x_array));
677     }
678     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
679     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
680     if (xisdevice) {
681       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
682     } else {
683       PetscCall(VecRestoreArrayRead(x, &x_array));
684     }
685   }
686 
687   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
688   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
689   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
690 
691   if (use_y_array_directly) {
692     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
693   } else {
694     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
695     PetscScalar *yv;
696 
697     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
698     // There is nothing in the interface to say where the user would like it to end up. So we
699     // choose the GPU, because it is the faster option
700     if (yiscupm) {
701       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
702     } else {
703       PetscCall(VecGetArray(y, &yv));
704     }
705     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
706     if (yiscupm) {
707       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
708     } else {
709       PetscCall(VecRestoreArray(y, &yv));
710     }
711     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
712   }
713   PetscFunctionReturn(PETSC_SUCCESS);
714 }
715 
716 template <device::cupm::DeviceType T>
717 template <typename Solver, bool transpose>
718 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
719 {
720   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
721   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
722   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
723   PetscScalar       *y;
724   PetscBool          biscupm, xiscupm, aiscupm;
725   PetscDeviceContext dctx;
726   cupmStream_t       stream;
727 
728   PetscFunctionBegin;
729   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
730   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
731   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
732   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
733   PetscCall(GetHandles_(&dctx, &stream));
734   {
735     PetscInt n;
736 
737     PetscCall(MatGetSize(B, nullptr, &n));
738     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
739     PetscCall(MatDenseGetLDA(B, &n));
740     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
741     PetscCall(MatDenseGetLDA(X, &n));
742     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
743   }
744   {
745     // The logic here is to try to minimize the amount of memory copying:
746     //
747     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
748     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
749     // get the data in order to copy it into the y array. So the array b will be wherever the
750     // data already is so that only one memcpy is performed
751     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
752     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
753     const PetscScalar *b;
754 
755     if (bisdevice) {
756       b = DeviceArrayRead(dctx, B);
757     } else if (biscupm) {
758       b = HostArrayRead(dctx, B);
759     } else {
760       PetscCall(MatDenseGetArrayRead(B, &b));
761     }
762 
763     if (ldx < m || !xiscupm) {
764       // X's array cannot serve as the array (too small or not on device), B's array cannot
765       // serve as the array (const), so allocate a new array
766       ldy = m;
767       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
768     } else {
769       // X's array should serve as the array
770       ldy = ldx;
771       y   = DeviceArrayWrite(dctx, X);
772     }
773     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
774     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
775   }
776 
777   // convert to CUPM twice??????????????????????????????????
778   // but A should already be CUPM??????????????????????????????????????
779   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
780   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
781   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
782 
783   if (ldx < m || !xiscupm) {
784     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
785     PetscScalar *x;
786 
787     // The logic here is that the data is not yet in either X's GPU array or its CPU
788     // array. There is nothing in the interface to say where the user would like it to end up.
789     // So we choose the GPU, because it is the faster option
790     if (xiscupm) {
791       x = DeviceArrayWrite(dctx, X);
792     } else {
793       PetscCall(MatDenseGetArray(X, &x));
794     }
795     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
796     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
797     PetscCallCUPM(cupmFreeAsync(y, stream));
798   }
799   PetscFunctionReturn(PETSC_SUCCESS);
800 }
801 
802 template <device::cupm::DeviceType T>
803 template <bool transpose>
804 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
805 {
806   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
807   const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
808   cupmBlasHandle_t   handle;
809   PetscDeviceContext dctx;
810 
811   PetscFunctionBegin;
812   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
813   if (!m || !n) {
814     // mult only
815     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
816     PetscFunctionReturn(PETSC_SUCCESS);
817   }
818   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
819   PetscCall(GetHandles_(&dctx, &handle));
820   {
821     constexpr auto op   = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
822     const auto     one  = cupmScalarCast(1.0);
823     const auto     zero = cupmScalarCast(0.0);
824     const auto     da   = DeviceArrayRead(dctx, A);
825     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
826     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
827 
828     PetscCall(PetscLogGpuTimeBegin());
829     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
830     PetscCall(PetscLogGpuTimeEnd());
831   }
832   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
836 // ==========================================================================================
837 // MatDense_Seq_CUPM - Private API - Conversion Dispatch
838 // ==========================================================================================
839 
840 template <device::cupm::DeviceType T>
841 template <bool to_host>
842 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
843 {
844   PetscFunctionBegin;
845   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
846     // TODO these cases should be optimized
847     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
848   } else {
849     const auto B    = *newmat;
850     const auto pobj = PetscObjectCast(B);
851 
852     if (to_host) {
853       PetscCall(BindToCPU(B, PETSC_TRUE));
854       PetscCall(Reset(B));
855     } else {
856       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
857     }
858 
859     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
860     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
861     // cvec might be the wrong VecType, destroy and rebuild it if necessary
862     // REVIEW ME: this is possibly very inefficient
863     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
864 
865     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
866     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
867     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
868     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
869     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
870     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
871     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
872     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
873     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
874     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
875     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
876 
877     if (to_host) {
878       B->offloadmask = PETSC_OFFLOAD_CPU;
879     } else {
880       Mat_SeqDenseCUPM *mcu;
881 
882       PetscCall(PetscNew(&mcu));
883       B->spptr       = mcu;
884       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
885       PetscCall(BindToCPU(B, PETSC_FALSE));
886     }
887 
888     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
889     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
890   }
891   PetscFunctionReturn(PETSC_SUCCESS);
892 }
893 
894 // ==========================================================================================
895 // MatDense_Seq_CUPM - Public API
896 // ==========================================================================================
897 
898 template <device::cupm::DeviceType T>
899 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
900 {
901   return MATSEQDENSECUPM();
902 }
903 
904 template <device::cupm::DeviceType T>
905 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
906 {
907   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
908 }
909 
910 template <device::cupm::DeviceType T>
911 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
912 {
913   return static_cast<Mat_SeqDense *>(m->data);
914 }
915 
916 template <device::cupm::DeviceType T>
917 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
918 {
919   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
920 }
921 
922 template <device::cupm::DeviceType T>
923 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
924 {
925   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
926 }
927 
928 // ==========================================================================================
929 
930 // MatCreate_SeqDenseCUPM()
931 template <device::cupm::DeviceType T>
932 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
933 {
934   PetscFunctionBegin;
935   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
936   PetscCall(MatCreate_SeqDense(A));
937   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
938   PetscFunctionReturn(PETSC_SUCCESS);
939 }
940 
941 template <device::cupm::DeviceType T>
942 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
943 {
944   PetscFunctionBegin;
945   // prevent copying back data if we own the data pointer
946   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
947   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
948   PetscCall(MatDestroy_SeqDense(A));
949   PetscFunctionReturn(PETSC_SUCCESS);
950 }
951 
952 // obj->ops->setup()
953 template <device::cupm::DeviceType T>
954 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
955 {
956   PetscFunctionBegin;
957   PetscCall(PetscLayoutSetUp(A->rmap));
958   PetscCall(PetscLayoutSetUp(A->cmap));
959   if (!A->preallocated) {
960     PetscDeviceContext dctx;
961 
962     PetscCall(GetHandles_(&dctx));
963     PetscCall(SetPreallocation(A, dctx));
964   }
965   PetscFunctionReturn(PETSC_SUCCESS);
966 }
967 
968 template <device::cupm::DeviceType T>
969 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
970 {
971   PetscFunctionBegin;
972   if (const auto mcu = MatCUPMCast(A)) {
973     cupmStream_t stream;
974 
975     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
976     PetscCall(GetHandles_(&stream));
977     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
978     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
979     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
980     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
981     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
982     PetscCall(VecDestroy(&mcu->workvec));
983     PetscCall(PetscFree(A->spptr /* mcu */));
984   }
985   PetscFunctionReturn(PETSC_SUCCESS);
986 }
987 
988 // ==========================================================================================
989 
990 template <device::cupm::DeviceType T>
991 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
992 {
993   const auto mimpl = MatIMPLCast(A);
994   const auto pobj  = PetscObjectCast(A);
995 
996   PetscFunctionBegin;
997   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
998   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
999   A->boundtocpu = to_host;
1000   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1001   if (to_host) {
1002     PetscDeviceContext dctx;
1003 
1004     // make sure we have an up-to-date copy on the CPU
1005     PetscCall(GetHandles_(&dctx));
1006     PetscCall(DeviceToHost_(A, dctx));
1007   } else {
1008     PetscBool iscupm;
1009 
1010     if (auto &cvec = mimpl->cvec) {
1011       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1012       if (!iscupm) PetscCall(VecDestroy(&cvec));
1013     }
1014     if (auto &cmat = mimpl->cmat) {
1015       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1016       if (!iscupm) PetscCall(MatDestroy(&cmat));
1017     }
1018   }
1019 
1020   // ============================================================
1021   // Composed ops
1022   // ============================================================
1023   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1024   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1025   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1026   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1027   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1028   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1029   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1030   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1031   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1032   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1033   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1034   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1035   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1036   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1037   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1038   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1039   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1040   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1041   // always the same
1042   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1043 
1044   // ============================================================
1045   // Function pointer ops
1046   // ============================================================
1047   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1048   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1049   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1050   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1051   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1052   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1053   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1054   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1055   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1056   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1057   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1058   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1059   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1060   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1061   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1062   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1063   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1064   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1065   // seemingly always the same
1066   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1067 
1068   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1069   PetscFunctionReturn(PETSC_SUCCESS);
1070 }
1071 
1072 template <device::cupm::DeviceType T>
1073 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1074 {
1075   PetscFunctionBegin;
1076   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1077   PetscFunctionReturn(PETSC_SUCCESS);
1078 }
1079 
1080 template <device::cupm::DeviceType T>
1081 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1082 {
1083   PetscFunctionBegin;
1084   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1085   PetscFunctionReturn(PETSC_SUCCESS);
1086 }
1087 
1088 // ==========================================================================================
1089 
1090 template <device::cupm::DeviceType T>
1091 template <PetscMemType mtype, PetscMemoryAccessMode access>
1092 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1093 {
1094   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1095   constexpr auto read_access = PetscMemoryAccessRead(access);
1096 
1097   PetscFunctionBegin;
1098   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1099   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1100   if (hostmem) {
1101     if (read_access) {
1102       PetscCall(DeviceToHost_(m, dctx));
1103     } else if (!MatIMPLCast(m)->v) {
1104       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1105       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1106     }
1107     *array = MatIMPLCast(m)->v;
1108   } else {
1109     if (read_access) {
1110       PetscCall(HostToDevice_(m, dctx));
1111     } else if (!MatCUPMCast(m)->d_v) {
1112       // write-only
1113       PetscCall(SetPreallocation(m, dctx, nullptr));
1114     }
1115     *array = MatCUPMCast(m)->d_v;
1116   }
1117   if (PetscMemoryAccessWrite(access)) {
1118     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1119     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1120   }
1121   PetscFunctionReturn(PETSC_SUCCESS);
1122 }
1123 
1124 template <device::cupm::DeviceType T>
1125 template <PetscMemType mtype, PetscMemoryAccessMode access>
1126 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1127 {
1128   PetscFunctionBegin;
1129   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1130   if (PetscMemoryAccessWrite(access)) {
1131     // WRITE or READ_WRITE
1132     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1133     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1134   }
1135   *array = nullptr;
1136   PetscFunctionReturn(PETSC_SUCCESS);
1137 }
1138 
1139 template <device::cupm::DeviceType T>
1140 template <PetscMemoryAccessMode access>
1141 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1142 {
1143   PetscFunctionBegin;
1144   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1145   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1146   PetscFunctionReturn(PETSC_SUCCESS);
1147 }
1148 
1149 template <device::cupm::DeviceType T>
1150 template <PetscMemoryAccessMode access>
1151 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1152 {
1153   PetscFunctionBegin;
1154   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1155   PetscFunctionReturn(PETSC_SUCCESS);
1156 }
1157 
1158 // ==========================================================================================
1159 
1160 template <device::cupm::DeviceType T>
1161 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1162 {
1163   const auto mimpl = MatIMPLCast(A);
1164   const auto mcu   = MatCUPMCast(A);
1165 
1166   PetscFunctionBegin;
1167   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1168   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1169   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1170   if (mimpl->v) {
1171     PetscDeviceContext dctx;
1172 
1173     PetscCall(GetHandles_(&dctx));
1174     PetscCall(HostToDevice_(A, dctx));
1175   }
1176   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1177   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1178   PetscFunctionReturn(PETSC_SUCCESS);
1179 }
1180 
1181 template <device::cupm::DeviceType T>
1182 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1183 {
1184   const auto mimpl = MatIMPLCast(A);
1185   const auto mcu   = MatCUPMCast(A);
1186 
1187   PetscFunctionBegin;
1188   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1189   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1190   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1191   if (!mcu->d_user_alloc) {
1192     cupmStream_t stream;
1193 
1194     PetscCall(GetHandles_(&stream));
1195     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1196   }
1197   mcu->d_v          = const_cast<PetscScalar *>(array);
1198   mcu->d_user_alloc = PETSC_FALSE;
1199   PetscFunctionReturn(PETSC_SUCCESS);
1200 }
1201 
1202 template <device::cupm::DeviceType T>
1203 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1204 {
1205   const auto mimpl = MatIMPLCast(A);
1206   const auto mcu   = MatCUPMCast(A);
1207 
1208   PetscFunctionBegin;
1209   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1210   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1211   if (mimpl->v) {
1212     PetscDeviceContext dctx;
1213 
1214     PetscCall(GetHandles_(&dctx));
1215     PetscCall(HostToDevice_(A, dctx));
1216   }
1217   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1218   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1219   PetscFunctionReturn(PETSC_SUCCESS);
1220 }
1221 
1222 // ==========================================================================================
1223 
1224 template <device::cupm::DeviceType T>
1225 template <bool transpose_A, bool transpose_B>
1226 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1227 {
1228   cupmBlasInt_t      m, n, k;
1229   PetscBool          Aiscupm, Biscupm;
1230   PetscDeviceContext dctx;
1231   cupmBlasHandle_t   handle;
1232 
1233   PetscFunctionBegin;
1234   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1235   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1236   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1237   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1238 
1239   // we may end up with SEQDENSE as one of the arguments
1240   // REVIEW ME: how? and why is it not B and C????????
1241   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1242   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1243   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1244   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1245   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1246   PetscCall(GetHandles_(&dctx, &handle));
1247 
1248   PetscCall(PetscLogGpuTimeBegin());
1249   {
1250     const auto one  = cupmScalarCast(1.0);
1251     const auto zero = cupmScalarCast(0.0);
1252     const auto da   = DeviceArrayRead(dctx, A);
1253     const auto db   = DeviceArrayRead(dctx, B);
1254     const auto dc   = DeviceArrayWrite(dctx, C);
1255     PetscInt   alda, blda, clda;
1256 
1257     PetscCall(MatDenseGetLDA(A, &alda));
1258     PetscCall(MatDenseGetLDA(B, &blda));
1259     PetscCall(MatDenseGetLDA(C, &clda));
1260     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1261   }
1262   PetscCall(PetscLogGpuTimeEnd());
1263 
1264   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1265   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1266   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1267   PetscFunctionReturn(PETSC_SUCCESS);
1268 }
1269 
1270 template <device::cupm::DeviceType T>
1271 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1272 {
1273   const auto m = A->rmap->n;
1274   const auto n = A->cmap->n;
1275 
1276   PetscFunctionBegin;
1277   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1278   // The two matrices must have the same copy implementation to be eligible for fast copy
1279   if (A->ops->copy == B->ops->copy) {
1280     PetscDeviceContext dctx;
1281     cupmStream_t       stream;
1282 
1283     PetscCall(GetHandles_(&dctx, &stream));
1284     PetscCall(PetscLogGpuTimeBegin());
1285     {
1286       const auto va = DeviceArrayRead(dctx, A);
1287       const auto vb = DeviceArrayWrite(dctx, B);
1288       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1289       // lda!
1290       const auto lda_a = MatIMPLCast(A)->lda;
1291       const auto lda_b = MatIMPLCast(B)->lda;
1292 
1293       if (lda_a > m || lda_b > m) {
1294         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1295         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1296         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1297       } else {
1298         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1299       }
1300     }
1301     PetscCall(PetscLogGpuTimeEnd());
1302   } else {
1303     PetscCall(MatCopy_Basic(A, B, str));
1304   }
1305   PetscFunctionReturn(PETSC_SUCCESS);
1306 }
1307 
1308 template <device::cupm::DeviceType T>
1309 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1310 {
1311   PetscDeviceContext dctx;
1312   cupmStream_t       stream;
1313 
1314   PetscFunctionBegin;
1315   PetscCall(GetHandles_(&dctx, &stream));
1316   PetscCall(PetscLogGpuTimeBegin());
1317   {
1318     const auto va  = DeviceArrayWrite(dctx, m);
1319     const auto lda = MatIMPLCast(m)->lda;
1320     const auto ma  = m->rmap->n;
1321     const auto na  = m->cmap->n;
1322 
1323     if (lda > ma) {
1324       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1325     } else {
1326       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1327     }
1328   }
1329   PetscCall(PetscLogGpuTimeEnd());
1330   PetscFunctionReturn(PETSC_SUCCESS);
1331 }
1332 
1333 template <device::cupm::DeviceType T>
1334 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1335 {
1336   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
1337   const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
1338   const auto         N = m * n;
1339   cupmBlasHandle_t   handle;
1340   PetscDeviceContext dctx;
1341 
1342   PetscFunctionBegin;
1343   PetscCall(PetscInfo(A, "Performing Scale %d x %d on backend\n", m, n));
1344   PetscCall(GetHandles_(&dctx, &handle));
1345   PetscCall(PetscLogGpuTimeBegin());
1346   {
1347     const auto cu_alpha = cupmScalarCast(alpha);
1348     const auto da       = DeviceArrayReadWrite(dctx, A);
1349     const auto lda      = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
1350 
1351     if (lda > m) {
1352       for (cupmBlasInt_t j = 0; j < n; ++j) PetscCallCUPMBLAS(cupmBlasXscal(handle, m, &cu_alpha, da.cupmdata() + lda * j, 1));
1353     } else {
1354       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1355     }
1356   }
1357   PetscCall(PetscLogGpuTimeEnd());
1358   PetscCall(PetscLogGpuFlops(N));
1359   PetscFunctionReturn(PETSC_SUCCESS);
1360 }
1361 
1362 template <device::cupm::DeviceType T>
1363 inline PetscErrorCode MatDense_Seq_CUPM<T>::Shift(Mat A, PetscScalar alpha) noexcept
1364 {
1365   const auto         m = A->rmap->n;
1366   const auto         n = A->cmap->n;
1367   PetscDeviceContext dctx;
1368 
1369   PetscFunctionBegin;
1370   PetscCall(GetHandles_(&dctx));
1371   PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1372   PetscCall(PointwiseUnaryTransform(A, 0, m, n, dctx, device::cupm::functors::make_plus_equals(alpha)));
1373   PetscFunctionReturn(PETSC_SUCCESS);
1374 }
1375 
1376 template <device::cupm::DeviceType T>
1377 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1378 {
1379   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1380   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1381   PetscDeviceContext dctx;
1382   cupmBlasHandle_t   handle;
1383 
1384   PetscFunctionBegin;
1385   if (!m_x || !n_x) PetscFunctionReturn(PETSC_SUCCESS);
1386   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1387   PetscCall(GetHandles_(&dctx, &handle));
1388   {
1389     const auto N        = m_x * n_x;
1390     const auto dx       = DeviceArrayRead(dctx, X);
1391     const auto dy       = alpha == 0.0 ? DeviceArrayWrite(dctx, Y).cupmdata() : DeviceArrayReadWrite(dctx, Y).cupmdata();
1392     const auto ldax     = static_cast<cupmBlasInt_t>(MatIMPLCast(X)->lda);
1393     const auto lday     = static_cast<cupmBlasInt_t>(MatIMPLCast(Y)->lda);
1394     const auto cu_alpha = cupmScalarCast(alpha);
1395 
1396     PetscCall(PetscLogGpuTimeBegin());
1397     if (ldax > m_x || lday > m_x) {
1398       for (cupmBlasInt_t j = 0; j < n_x; j++) PetscCallCUPMBLAS(cupmBlasXaxpy(handle, m_x, &cu_alpha, dx.cupmdata() + j * ldax, 1, dy + j * lday, 1));
1399     } else {
1400       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy, 1));
1401     }
1402     PetscCall(PetscLogGpuTimeEnd());
1403     PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1404   }
1405   PetscFunctionReturn(PETSC_SUCCESS);
1406 }
1407 
1408 template <device::cupm::DeviceType T>
1409 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1410 {
1411   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1412   PetscDeviceContext dctx;
1413 
1414   PetscFunctionBegin;
1415   PetscCall(GetHandles_(&dctx));
1416   // do not call SetPreallocation() yet, we call it afterwards??
1417   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1418   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1419   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1420   // allocate memory if needed
1421   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx));
1422   PetscFunctionReturn(PETSC_SUCCESS);
1423 }
1424 
1425 template <device::cupm::DeviceType T>
1426 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1427 {
1428   PetscBool device;
1429 
1430   PetscFunctionBegin;
1431   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1432   if (device) {
1433     const auto         m = A->rmap->n;
1434     const auto         n = A->cmap->n;
1435     PetscDeviceContext dctx;
1436 
1437     PetscCall(GetHandles_(&dctx));
1438     {
1439       const auto a = DeviceArrayWrite(dctx, A);
1440       PetscInt   lda;
1441 
1442       PetscCall(MatDenseGetLDA(A, &lda));
1443       if (lda > m) {
1444         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1445       } else {
1446         PetscInt mn;
1447 
1448         PetscCall(PetscIntMultError(m, n, &mn));
1449         PetscCall(PetscRandomGetValues(rng, mn, a));
1450       }
1451     }
1452   } else {
1453     PetscCall(MatSetRandom_SeqDense(A, rng));
1454   }
1455   PetscFunctionReturn(PETSC_SUCCESS);
1456 }
1457 
1458 // ==========================================================================================
1459 
1460 template <device::cupm::DeviceType T>
1461 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1462 {
1463   const auto         offloadmask = A->offloadmask;
1464   const auto         n           = A->rmap->n;
1465   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1466   PetscBool          viscupm;
1467   PetscDeviceContext dctx;
1468   cupmStream_t       stream;
1469 
1470   PetscFunctionBegin;
1471   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1472   PetscCall(GetHandles_(&dctx, &stream));
1473   if (viscupm && !v->boundtocpu) {
1474     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1475 
1476     // update device data
1477     if (PetscOffloadDevice(offloadmask)) {
1478       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1479     } else {
1480       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1481     }
1482   } else {
1483     PetscScalar *x;
1484 
1485     // update host data
1486     PetscCall(VecGetArrayWrite(v, &x));
1487     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1488       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1489     } else if (PetscOffloadDevice(offloadmask)) {
1490       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1491     }
1492     PetscCall(VecRestoreArrayWrite(v, &x));
1493   }
1494   PetscFunctionReturn(PETSC_SUCCESS);
1495 }
1496 
1497 template <device::cupm::DeviceType T>
1498 template <PetscMemoryAccessMode access>
1499 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1500 {
1501   using namespace vec::cupm;
1502   const auto         mimpl = MatIMPLCast(A);
1503   PetscDeviceContext dctx;
1504 
1505   PetscFunctionBegin;
1506   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1507   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1508   mimpl->vecinuse = col + 1;
1509   PetscCall(GetHandles_(&dctx));
1510   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1511   if (!mimpl->cvec) {
1512     // we pass the data of A, to prevent allocating needless GPU memory the first time
1513     // VecCUPMPlaceArray is called
1514     PetscCall(VecCreateSeqCUPMWithArraysAsync<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->bs, A->rmap->n, nullptr, mimpl->ptrinuse, &mimpl->cvec));
1515   }
1516   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1517   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1518   *v = mimpl->cvec;
1519   PetscFunctionReturn(PETSC_SUCCESS);
1520 }
1521 
1522 template <device::cupm::DeviceType T>
1523 template <PetscMemoryAccessMode access>
1524 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1525 {
1526   using namespace vec::cupm;
1527   const auto         mimpl = MatIMPLCast(A);
1528   const auto         cvec  = mimpl->cvec;
1529   PetscDeviceContext dctx;
1530 
1531   PetscFunctionBegin;
1532   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1533   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1534   mimpl->vecinuse = 0;
1535   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1536   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1537   PetscCall(GetHandles_(&dctx));
1538   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1539   if (v) *v = nullptr;
1540   PetscFunctionReturn(PETSC_SUCCESS);
1541 }
1542 
1543 // ==========================================================================================
1544 
1545 template <device::cupm::DeviceType T>
1546 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1547 {
1548   Mat                fact;
1549   PetscDeviceContext dctx;
1550 
1551   PetscFunctionBegin;
1552   PetscCall(GetHandles_(&dctx));
1553   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1554   fact->factortype = ftype;
1555   switch (ftype) {
1556   case MAT_FACTOR_LU:
1557   case MAT_FACTOR_ILU: // fall-through
1558     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1559     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1560     break;
1561   case MAT_FACTOR_CHOLESKY:
1562   case MAT_FACTOR_ICC: // fall-through
1563     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1564     break;
1565   case MAT_FACTOR_QR: {
1566     const auto pobj = PetscObjectCast(fact);
1567 
1568     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1569     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1570   } break;
1571   case MAT_FACTOR_NONE:
1572   case MAT_FACTOR_ILUDT:     // fall-through
1573   case MAT_FACTOR_NUM_TYPES: // fall-through
1574     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1575   }
1576   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1577   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1578   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1579   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1580   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1581   *fact_out = fact;
1582   PetscFunctionReturn(PETSC_SUCCESS);
1583 }
1584 
1585 template <device::cupm::DeviceType T>
1586 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1587 {
1588   const auto         mimpl = MatIMPLCast(A);
1589   const auto         mcu   = MatCUPMCast(A);
1590   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1591   cupmSolverHandle_t handle;
1592   PetscDeviceContext dctx;
1593   cupmStream_t       stream;
1594 
1595   PetscFunctionBegin;
1596   #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1597   // HIP appears to have this by default??
1598   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1599   #endif
1600   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1601   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1602   // spd
1603   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1604 
1605   PetscCall(GetHandles_(&dctx, &handle, &stream));
1606   {
1607     const auto    da  = DeviceArrayReadWrite(dctx, A);
1608     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1609     cupmBlasInt_t il;
1610 
1611     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1612     if (il > mcu->d_fact_lwork) {
1613       mcu->d_fact_lwork = il;
1614       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1615       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1616     }
1617     PetscCall(PetscLogGpuTimeBegin());
1618     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1619     PetscCall(PetscLogGpuTimeEnd());
1620   }
1621   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1622   // TODO (write cuda kernel)
1623   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1624   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1625 
1626   A->ops->solve          = nullptr;
1627   A->ops->solvetranspose = nullptr;
1628   A->ops->matsolve       = nullptr;
1629   A->factortype          = MAT_FACTOR_NONE;
1630 
1631   PetscCall(PetscFree(A->solvertype));
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
1635 // ==========================================================================================
1636 
1637 template <device::cupm::DeviceType T>
1638 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1639 {
1640   const auto         mimpl        = MatIMPLCast(A);
1641   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1642   const auto         n            = rend - rbegin;
1643   const auto         m            = cend - cbegin;
1644   auto              &cmat         = mimpl->cmat;
1645   PetscDeviceContext dctx;
1646 
1647   PetscFunctionBegin;
1648   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1649   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1650   mimpl->matinuse = cbegin + 1;
1651 
1652   PetscCall(GetHandles_(&dctx));
1653   PetscCall(HostToDevice_(A, dctx));
1654 
1655   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1656   {
1657     const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1658 
1659     if (cmat) {
1660       PetscCall(PlaceArray(cmat, device_array));
1661     } else {
1662       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1663     }
1664   }
1665   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1666   // place CPU array if present but do not copy any data
1667   if (const auto host_array = mimpl->v) {
1668     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1669     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1670   }
1671 
1672   cmat->offloadmask = A->offloadmask;
1673   *mat              = cmat;
1674   PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676 
1677 template <device::cupm::DeviceType T>
1678 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1679 {
1680   const auto mimpl = MatIMPLCast(A);
1681   const auto cmat  = mimpl->cmat;
1682   const auto reset = static_cast<bool>(mimpl->v);
1683   bool       copy, was_offload_host;
1684 
1685   PetscFunctionBegin;
1686   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1687   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1688   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1689   mimpl->matinuse = 0;
1690 
1691   // calls to ResetArray may change it, so save it here
1692   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1693   if (was_offload_host && !reset) {
1694     copy = true;
1695     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1696   } else {
1697     copy = false;
1698   }
1699 
1700   PetscCall(ResetArray(cmat));
1701   if (reset) PetscCall(MatDenseResetArray(cmat));
1702   if (copy) {
1703     PetscDeviceContext dctx;
1704 
1705     PetscCall(GetHandles_(&dctx));
1706     PetscCall(DeviceToHost_(A, dctx));
1707   } else {
1708     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1709   }
1710 
1711   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1712   *m                = nullptr;
1713   PetscFunctionReturn(PETSC_SUCCESS);
1714 }
1715 
1716 // ==========================================================================================
1717 
1718 namespace
1719 {
1720 
1721 template <device::cupm::DeviceType T>
1722 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1723 {
1724   PetscFunctionBegin;
1725   if (TA) {
1726     if (TB) {
1727       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1728     } else {
1729       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1730     }
1731   } else {
1732     if (TB) {
1733       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1734     } else {
1735       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1736     }
1737   }
1738   PetscFunctionReturn(PETSC_SUCCESS);
1739 }
1740 
1741 template <device::cupm::DeviceType T>
1742 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1743 {
1744   PetscFunctionBegin;
1745   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1746     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1747     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1748   }
1749   PetscFunctionReturn(PETSC_SUCCESS);
1750 }
1751 
1752 } // anonymous namespace
1753 
1754 } // namespace impl
1755 
1756 } // namespace cupm
1757 
1758 } // namespace mat
1759 
1760 } // namespace Petsc
1761 
1762 #endif // __cplusplus
1763 
1764 #endif // PETSCMATSEQDENSECUPM_HPP
1765