xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 1cc06b555e92f8ec64db10330b8bbd830e5bc876)
1f17f7ee2SSatish Balay #include <h2opusconf.h>
2f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4f17f7ee2SSatish Balay   #include <h2opus.h>
5f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
6f17f7ee2SSatish Balay     #include <h2opus/distributed/distributed_h2opus_handle.h>
7f17f7ee2SSatish Balay     #include <h2opus/distributed/distributed_geometric_construction.h>
8f17f7ee2SSatish Balay     #include <h2opus/distributed/distributed_hgemv.h>
9f17f7ee2SSatish Balay     #include <h2opus/distributed/distributed_horthog.h>
10f17f7ee2SSatish Balay     #include <h2opus/distributed/distributed_hcompress.h>
11f17f7ee2SSatish Balay   #endif
12f17f7ee2SSatish Balay   #include <h2opus/util/boxentrygen.h>
13f17f7ee2SSatish Balay   #include <petsc/private/matimpl.h>
14f17f7ee2SSatish Balay   #include <petsc/private/vecimpl.h>
15f17f7ee2SSatish Balay   #include <petsc/private/deviceimpl.h>
16f17f7ee2SSatish Balay   #include <petscsf.h>
17f17f7ee2SSatish Balay 
18f17f7ee2SSatish Balay /* math2opusutils */
19f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *);
20f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *);
21f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
22f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
23f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
24f17f7ee2SSatish Balay 
25f17f7ee2SSatish Balay   #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26f17f7ee2SSatish Balay 
27f17f7ee2SSatish Balay   /* Use GPU only if H2OPUS is configured for GPU */
28f17f7ee2SSatish Balay   #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29f17f7ee2SSatish Balay     #define PETSC_H2OPUS_USE_GPU
30f17f7ee2SSatish Balay   #endif
31f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
32f17f7ee2SSatish Balay     #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
33f17f7ee2SSatish Balay   #else
343ba16761SJacob Faibussowitsch     #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
35f17f7ee2SSatish Balay   #endif
36f17f7ee2SSatish Balay 
37f17f7ee2SSatish Balay // TODO H2OPUS:
38f17f7ee2SSatish Balay // DistributedHMatrix
39f17f7ee2SSatish Balay //   unsymmetric ?
40f17f7ee2SSatish Balay //   transpose for distributed_hgemv?
41f17f7ee2SSatish Balay //   clearData()
42f17f7ee2SSatish Balay // Unify interface for sequential and parallel?
43f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44f17f7ee2SSatish Balay //
459371c9d4SSatish Balay template <class T>
469371c9d4SSatish Balay class PetscPointCloud : public H2OpusDataSet<T> {
47f17f7ee2SSatish Balay private:
48f17f7ee2SSatish Balay   int            dimension;
49f17f7ee2SSatish Balay   size_t         num_points;
50f17f7ee2SSatish Balay   std::vector<T> pts;
51f17f7ee2SSatish Balay 
52f17f7ee2SSatish Balay public:
53d71ae5a4SJacob Faibussowitsch   PetscPointCloud(int dim, size_t num_pts, const T coords[])
54d71ae5a4SJacob Faibussowitsch   {
55f17f7ee2SSatish Balay     dim              = dim > 0 ? dim : 1;
56f17f7ee2SSatish Balay     this->dimension  = dim;
57f17f7ee2SSatish Balay     this->num_points = num_pts;
58f17f7ee2SSatish Balay 
59f17f7ee2SSatish Balay     pts.resize(num_pts * dim);
60f17f7ee2SSatish Balay     if (coords) {
61036e4bdcSSatish Balay       for (size_t n = 0; n < num_pts; n++)
629371c9d4SSatish Balay         for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
63f17f7ee2SSatish Balay     } else {
64036e4bdcSSatish Balay       PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
65036e4bdcSSatish Balay       for (size_t n = 0; n < num_pts; n++) {
66036e4bdcSSatish Balay         pts[n * dim] = n * h;
679371c9d4SSatish Balay         for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
68036e4bdcSSatish Balay       }
69f17f7ee2SSatish Balay     }
70f17f7ee2SSatish Balay   }
71f17f7ee2SSatish Balay 
72d71ae5a4SJacob Faibussowitsch   PetscPointCloud(const PetscPointCloud<T> &other)
73d71ae5a4SJacob Faibussowitsch   {
74f17f7ee2SSatish Balay     size_t N         = other.dimension * other.num_points;
75f17f7ee2SSatish Balay     this->dimension  = other.dimension;
76f17f7ee2SSatish Balay     this->num_points = other.num_points;
77f17f7ee2SSatish Balay     this->pts.resize(N);
789371c9d4SSatish Balay     for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
79f17f7ee2SSatish Balay   }
80f17f7ee2SSatish Balay 
819371c9d4SSatish Balay   int getDimension() const { return dimension; }
82f17f7ee2SSatish Balay 
839371c9d4SSatish Balay   size_t getDataSetSize() const { return num_points; }
84f17f7ee2SSatish Balay 
85d71ae5a4SJacob Faibussowitsch   T getDataPoint(size_t idx, int dim) const
86d71ae5a4SJacob Faibussowitsch   {
87f17f7ee2SSatish Balay     assert(dim < dimension && idx < num_points);
88f17f7ee2SSatish Balay     return pts[idx * dimension + dim];
89f17f7ee2SSatish Balay   }
90f17f7ee2SSatish Balay 
91d71ae5a4SJacob Faibussowitsch   void Print(std::ostream &out = std::cout)
92d71ae5a4SJacob Faibussowitsch   {
93f17f7ee2SSatish Balay     out << "Dimension: " << dimension << std::endl;
94f17f7ee2SSatish Balay     out << "NumPoints: " << num_points << std::endl;
95f17f7ee2SSatish Balay     for (size_t n = 0; n < num_points; n++) {
969371c9d4SSatish Balay       for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
97f17f7ee2SSatish Balay       out << std::endl;
98f17f7ee2SSatish Balay     }
99f17f7ee2SSatish Balay   }
100f17f7ee2SSatish Balay };
101f17f7ee2SSatish Balay 
1029371c9d4SSatish Balay template <class T>
1039371c9d4SSatish Balay class PetscFunctionGenerator {
104f17f7ee2SSatish Balay private:
105f17f7ee2SSatish Balay   MatH2OpusKernel k;
106f17f7ee2SSatish Balay   int             dim;
107f17f7ee2SSatish Balay   void           *ctx;
108f17f7ee2SSatish Balay 
109f17f7ee2SSatish Balay public:
110d71ae5a4SJacob Faibussowitsch   PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx)
111d71ae5a4SJacob Faibussowitsch   {
1129371c9d4SSatish Balay     this->k   = k;
1139371c9d4SSatish Balay     this->dim = dim;
1149371c9d4SSatish Balay     this->ctx = ctx;
115f17f7ee2SSatish Balay   }
116d71ae5a4SJacob Faibussowitsch   PetscFunctionGenerator(PetscFunctionGenerator &other)
117d71ae5a4SJacob Faibussowitsch   {
1189371c9d4SSatish Balay     this->k   = other.k;
1199371c9d4SSatish Balay     this->dim = other.dim;
1209371c9d4SSatish Balay     this->ctx = other.ctx;
1219371c9d4SSatish Balay   }
1229371c9d4SSatish Balay   T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
123f17f7ee2SSatish Balay };
124f17f7ee2SSatish Balay 
125f17f7ee2SSatish Balay   #include <../src/mat/impls/h2opus/math2opussampler.hpp>
126f17f7ee2SSatish Balay 
127f17f7ee2SSatish Balay   /* just to not clutter the code */
128f17f7ee2SSatish Balay   #if !defined(H2OPUS_USE_GPU)
129f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU;
130f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
131f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU;
132f17f7ee2SSatish Balay     #endif
133f17f7ee2SSatish Balay   #endif
134f17f7ee2SSatish Balay 
135f17f7ee2SSatish Balay typedef struct {
136f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
137f17f7ee2SSatish Balay   distributedH2OpusHandle_t handle;
138f17f7ee2SSatish Balay   #else
139f17f7ee2SSatish Balay   h2opusHandle_t handle;
140f17f7ee2SSatish Balay   #endif
141f17f7ee2SSatish Balay   /* Sequential and parallel matrices are two different classes at the moment */
142f17f7ee2SSatish Balay   HMatrix *hmatrix;
143f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
144f17f7ee2SSatish Balay   DistributedHMatrix *dist_hmatrix;
145f17f7ee2SSatish Balay   #else
146f17f7ee2SSatish Balay   HMatrix       *dist_hmatrix;     /* just to not clutter the code */
147f17f7ee2SSatish Balay   #endif
148f17f7ee2SSatish Balay   /* May use permutations */
149f17f7ee2SSatish Balay   PetscSF                           sf;
150f17f7ee2SSatish Balay   PetscLayout                       h2opus_rmap, h2opus_cmap;
151f17f7ee2SSatish Balay   IS                                h2opus_indexmap;
152f17f7ee2SSatish Balay   thrust::host_vector<PetscScalar> *xx, *yy;
153f17f7ee2SSatish Balay   PetscInt                          xxs, yys;
154f17f7ee2SSatish Balay   PetscBool                         multsetup;
155f17f7ee2SSatish Balay 
156f17f7ee2SSatish Balay   /* GPU */
157f17f7ee2SSatish Balay   HMatrix_GPU *hmatrix_gpu;
158f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
159f17f7ee2SSatish Balay   DistributedHMatrix_GPU *dist_hmatrix_gpu;
160f17f7ee2SSatish Balay   #else
161f17f7ee2SSatish Balay   HMatrix_GPU   *dist_hmatrix_gpu; /* just to not clutter the code */
162f17f7ee2SSatish Balay   #endif
163f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
164f17f7ee2SSatish Balay   thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
165f17f7ee2SSatish Balay   PetscInt                            xxs_gpu, yys_gpu;
166f17f7ee2SSatish Balay   #endif
167f17f7ee2SSatish Balay 
168f17f7ee2SSatish Balay   /* construction from matvecs */
169f17f7ee2SSatish Balay   PetscMatrixSampler *sampler;
170f17f7ee2SSatish Balay   PetscBool           nativemult;
171f17f7ee2SSatish Balay 
172f17f7ee2SSatish Balay   /* Admissibility */
173f17f7ee2SSatish Balay   PetscReal eta;
174f17f7ee2SSatish Balay   PetscInt  leafsize;
175f17f7ee2SSatish Balay 
176f17f7ee2SSatish Balay   /* for dof reordering */
177f17f7ee2SSatish Balay   PetscPointCloud<PetscReal> *ptcloud;
178f17f7ee2SSatish Balay 
179f17f7ee2SSatish Balay   /* kernel for generating matrix entries */
180f17f7ee2SSatish Balay   PetscFunctionGenerator<PetscScalar> *kernel;
181f17f7ee2SSatish Balay 
182f17f7ee2SSatish Balay   /* basis orthogonalized? */
183f17f7ee2SSatish Balay   PetscBool orthogonal;
184f17f7ee2SSatish Balay 
185f17f7ee2SSatish Balay   /* customization */
186f17f7ee2SSatish Balay   PetscInt  basisord;
187f17f7ee2SSatish Balay   PetscInt  max_rank;
188f17f7ee2SSatish Balay   PetscInt  bs;
189f17f7ee2SSatish Balay   PetscReal rtol;
190f17f7ee2SSatish Balay   PetscInt  norm_max_samples;
191f17f7ee2SSatish Balay   PetscBool check_construction;
192f17f7ee2SSatish Balay   PetscBool hara_verbose;
193036e4bdcSSatish Balay   PetscBool resize;
194f17f7ee2SSatish Balay 
195f17f7ee2SSatish Balay   /* keeps track of MatScale values */
196f17f7ee2SSatish Balay   PetscScalar s;
197f17f7ee2SSatish Balay } Mat_H2OPUS;
198f17f7ee2SSatish Balay 
199d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200d71ae5a4SJacob Faibussowitsch {
201f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
202f17f7ee2SSatish Balay 
203f17f7ee2SSatish Balay   PetscFunctionBegin;
204f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
205f17f7ee2SSatish Balay   h2opusDestroyDistributedHandle(a->handle);
206f17f7ee2SSatish Balay   #else
207f17f7ee2SSatish Balay   h2opusDestroyHandle(a->handle);
208f17f7ee2SSatish Balay   #endif
209f17f7ee2SSatish Balay   delete a->dist_hmatrix;
210f17f7ee2SSatish Balay   delete a->hmatrix;
211f17f7ee2SSatish Balay   PetscCall(PetscSFDestroy(&a->sf));
212f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
213f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
214f17f7ee2SSatish Balay   PetscCall(ISDestroy(&a->h2opus_indexmap));
215f17f7ee2SSatish Balay   delete a->xx;
216f17f7ee2SSatish Balay   delete a->yy;
217f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
218f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
219f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
220f17f7ee2SSatish Balay   delete a->xx_gpu;
221f17f7ee2SSatish Balay   delete a->yy_gpu;
222f17f7ee2SSatish Balay   #endif
223f17f7ee2SSatish Balay   delete a->sampler;
224f17f7ee2SSatish Balay   delete a->ptcloud;
225f17f7ee2SSatish Balay   delete a->kernel;
226f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
227f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
228f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
229f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
230f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
231f17f7ee2SSatish Balay   PetscCall(PetscFree(A->data));
2323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
233f17f7ee2SSatish Balay }
234f17f7ee2SSatish Balay 
235d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
236d71ae5a4SJacob Faibussowitsch {
237f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
238f17f7ee2SSatish Balay   PetscBool   ish2opus;
239f17f7ee2SSatish Balay 
240f17f7ee2SSatish Balay   PetscFunctionBegin;
241f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
242f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A, nm, 2);
243f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
244f17f7ee2SSatish Balay   if (ish2opus) {
245f17f7ee2SSatish Balay     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246f17f7ee2SSatish Balay       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247f17f7ee2SSatish Balay         PetscLayout t;
248f17f7ee2SSatish Balay         t              = A->rmap;
249f17f7ee2SSatish Balay         A->rmap        = a->h2opus_rmap;
250f17f7ee2SSatish Balay         a->h2opus_rmap = t;
251f17f7ee2SSatish Balay         t              = A->cmap;
252f17f7ee2SSatish Balay         A->cmap        = a->h2opus_cmap;
253f17f7ee2SSatish Balay         a->h2opus_cmap = t;
254f17f7ee2SSatish Balay       }
255f17f7ee2SSatish Balay     }
256f17f7ee2SSatish Balay     a->nativemult = nm;
257f17f7ee2SSatish Balay   }
2583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
259f17f7ee2SSatish Balay }
260f17f7ee2SSatish Balay 
261d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262d71ae5a4SJacob Faibussowitsch {
263f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
264f17f7ee2SSatish Balay   PetscBool   ish2opus;
265f17f7ee2SSatish Balay 
266f17f7ee2SSatish Balay   PetscFunctionBegin;
267f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
268f17f7ee2SSatish Balay   PetscValidPointer(nm, 2);
269f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
270f17f7ee2SSatish Balay   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
271f17f7ee2SSatish Balay   *nm = a->nativemult;
2723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
273f17f7ee2SSatish Balay }
274f17f7ee2SSatish Balay 
275d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
276d71ae5a4SJacob Faibussowitsch {
277f17f7ee2SSatish Balay   PetscBool   ish2opus;
278f17f7ee2SSatish Balay   PetscInt    nmax = PETSC_DECIDE;
279f17f7ee2SSatish Balay   Mat_H2OPUS *a    = NULL;
280f17f7ee2SSatish Balay   PetscBool   mult = PETSC_FALSE;
281f17f7ee2SSatish Balay 
282f17f7ee2SSatish Balay   PetscFunctionBegin;
283f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
284f17f7ee2SSatish Balay   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
285f17f7ee2SSatish Balay     a = (Mat_H2OPUS *)A->data;
286f17f7ee2SSatish Balay 
287f17f7ee2SSatish Balay     nmax = a->norm_max_samples;
288f17f7ee2SSatish Balay     mult = a->nativemult;
289f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
290f17f7ee2SSatish Balay   } else {
291f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
292f17f7ee2SSatish Balay   }
293f17f7ee2SSatish Balay   PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
294f17f7ee2SSatish Balay   if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
2953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
296f17f7ee2SSatish Balay }
297f17f7ee2SSatish Balay 
298d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
299d71ae5a4SJacob Faibussowitsch {
300f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
301f17f7ee2SSatish Balay   PetscInt    n;
302f17f7ee2SSatish Balay   PetscBool   boundtocpu = PETSC_TRUE;
303f17f7ee2SSatish Balay 
304f17f7ee2SSatish Balay   PetscFunctionBegin;
305f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
306f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
307f17f7ee2SSatish Balay   #endif
308f17f7ee2SSatish Balay   PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
309f17f7ee2SSatish Balay   if (boundtocpu) {
3109371c9d4SSatish Balay     if (h2opus->xxs < xN) {
3119371c9d4SSatish Balay       h2opus->xx->resize(n * xN);
3129371c9d4SSatish Balay       h2opus->xxs = xN;
3139371c9d4SSatish Balay     }
3149371c9d4SSatish Balay     if (h2opus->yys < yN) {
3159371c9d4SSatish Balay       h2opus->yy->resize(n * yN);
3169371c9d4SSatish Balay       h2opus->yys = yN;
3179371c9d4SSatish Balay     }
318f17f7ee2SSatish Balay   }
319f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
320f17f7ee2SSatish Balay   if (!boundtocpu) {
3219371c9d4SSatish Balay     if (h2opus->xxs_gpu < xN) {
3229371c9d4SSatish Balay       h2opus->xx_gpu->resize(n * xN);
3239371c9d4SSatish Balay       h2opus->xxs_gpu = xN;
3249371c9d4SSatish Balay     }
3259371c9d4SSatish Balay     if (h2opus->yys_gpu < yN) {
3269371c9d4SSatish Balay       h2opus->yy_gpu->resize(n * yN);
3279371c9d4SSatish Balay       h2opus->yys_gpu = yN;
3289371c9d4SSatish Balay     }
329f17f7ee2SSatish Balay   }
330f17f7ee2SSatish Balay   #endif
3313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
332f17f7ee2SSatish Balay }
333f17f7ee2SSatish Balay 
334d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
335d71ae5a4SJacob Faibussowitsch {
336f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
337f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
338f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
339f17f7ee2SSatish Balay   #else
340f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
341f17f7ee2SSatish Balay   #endif
342f17f7ee2SSatish Balay   PetscBool    boundtocpu = PETSC_TRUE;
343f17f7ee2SSatish Balay   PetscScalar *xx, *yy, *uxx, *uyy;
344f17f7ee2SSatish Balay   PetscInt     blda, clda;
345f17f7ee2SSatish Balay   PetscMPIInt  size;
346f17f7ee2SSatish Balay   PetscSF      bsf, csf;
347f17f7ee2SSatish Balay   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
348f17f7ee2SSatish Balay 
349f17f7ee2SSatish Balay   PetscFunctionBegin;
350f17f7ee2SSatish Balay   HLibProfile::clear();
351f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
352f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
353f17f7ee2SSatish Balay   #endif
354f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(B, &blda));
355f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(C, &clda));
356f17f7ee2SSatish Balay   if (usesf) {
357f17f7ee2SSatish Balay     PetscInt n;
358f17f7ee2SSatish Balay 
359f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf));
360f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf));
361f17f7ee2SSatish Balay 
362f17f7ee2SSatish Balay     PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
363f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
364f17f7ee2SSatish Balay     blda = n;
365f17f7ee2SSatish Balay     clda = n;
366f17f7ee2SSatish Balay   }
367f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
368f17f7ee2SSatish Balay   if (boundtocpu) {
369f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
370f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayWrite(C, &yy));
371f17f7ee2SSatish Balay     if (usesf) {
372f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
373f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
374f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
376f17f7ee2SSatish Balay     } else {
377f17f7ee2SSatish Balay       uxx = xx;
378f17f7ee2SSatish Balay       uyy = yy;
379f17f7ee2SSatish Balay     }
380f17f7ee2SSatish Balay     if (size > 1) {
381f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
382036e4bdcSSatish Balay       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
383f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
384f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
385f17f7ee2SSatish Balay   #endif
386f17f7ee2SSatish Balay     } else {
387f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
388f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
389f17f7ee2SSatish Balay     }
390f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
391f17f7ee2SSatish Balay     if (usesf) {
392f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
394f17f7ee2SSatish Balay     }
395f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayWrite(C, &yy));
396f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
397f17f7ee2SSatish Balay   } else {
398f17f7ee2SSatish Balay     PetscBool ciscuda, biscuda;
399f17f7ee2SSatish Balay 
400f17f7ee2SSatish Balay     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
401f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
40248a46eb9SPierre Jolivet     if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
403f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
404f17f7ee2SSatish Balay     if (!ciscuda) {
405f17f7ee2SSatish Balay       C->assembled = PETSC_TRUE;
406f17f7ee2SSatish Balay       PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
407f17f7ee2SSatish Balay     }
408f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
409f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
410f17f7ee2SSatish Balay     if (usesf) {
411f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
412f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
413f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
415f17f7ee2SSatish Balay     } else {
416f17f7ee2SSatish Balay       uxx = xx;
417f17f7ee2SSatish Balay       uyy = yy;
418f17f7ee2SSatish Balay     }
419f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
420f17f7ee2SSatish Balay     if (size > 1) {
421f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
422036e4bdcSSatish Balay       PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
423f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
424f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
425f17f7ee2SSatish Balay     #endif
426f17f7ee2SSatish Balay     } else {
427f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
428f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
429f17f7ee2SSatish Balay     }
430f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
431f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
432f17f7ee2SSatish Balay     if (usesf) {
433f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
435f17f7ee2SSatish Balay     }
436f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
43748a46eb9SPierre Jolivet     if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
43848a46eb9SPierre Jolivet     if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
439f17f7ee2SSatish Balay   #endif
440f17f7ee2SSatish Balay   }
441f17f7ee2SSatish Balay   { /* log flops */
442f17f7ee2SSatish Balay     double gops, time, perf, dev;
443f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops, time, perf, dev);
444f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
445f17f7ee2SSatish Balay     if (boundtocpu) {
446f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9 * gops));
447f17f7ee2SSatish Balay     } else {
448f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9 * gops));
449f17f7ee2SSatish Balay     }
450f17f7ee2SSatish Balay   #else
451f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9 * gops));
452f17f7ee2SSatish Balay   #endif
453f17f7ee2SSatish Balay   }
4543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
455f17f7ee2SSatish Balay }
456f17f7ee2SSatish Balay 
457d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
458d71ae5a4SJacob Faibussowitsch {
459f17f7ee2SSatish Balay   Mat_Product *product = C->product;
460f17f7ee2SSatish Balay 
461f17f7ee2SSatish Balay   PetscFunctionBegin;
462f17f7ee2SSatish Balay   MatCheckProduct(C, 1);
463f17f7ee2SSatish Balay   switch (product->type) {
464d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
465d71ae5a4SJacob Faibussowitsch     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
466d71ae5a4SJacob Faibussowitsch     break;
467d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
468d71ae5a4SJacob Faibussowitsch     PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
469d71ae5a4SJacob Faibussowitsch     break;
470d71ae5a4SJacob Faibussowitsch   default:
471d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
472f17f7ee2SSatish Balay   }
4733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
474f17f7ee2SSatish Balay }
475f17f7ee2SSatish Balay 
476d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
477d71ae5a4SJacob Faibussowitsch {
478f17f7ee2SSatish Balay   Mat_Product *product = C->product;
479f17f7ee2SSatish Balay   PetscBool    cisdense;
480f17f7ee2SSatish Balay   Mat          A, B;
481f17f7ee2SSatish Balay 
482f17f7ee2SSatish Balay   PetscFunctionBegin;
483f17f7ee2SSatish Balay   MatCheckProduct(C, 1);
484f17f7ee2SSatish Balay   A = product->A;
485f17f7ee2SSatish Balay   B = product->B;
486f17f7ee2SSatish Balay   switch (product->type) {
487f17f7ee2SSatish Balay   case MATPRODUCT_AB:
488f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
489f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
490f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
491f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
492f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
493f17f7ee2SSatish Balay     break;
494f17f7ee2SSatish Balay   case MATPRODUCT_AtB:
495f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
496f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
497f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
498f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
499f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
500f17f7ee2SSatish Balay     break;
501d71ae5a4SJacob Faibussowitsch   default:
502d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
503f17f7ee2SSatish Balay   }
504f17f7ee2SSatish Balay   C->ops->productsymbolic = NULL;
505f17f7ee2SSatish Balay   C->ops->productnumeric  = MatProductNumeric_H2OPUS;
5063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
507f17f7ee2SSatish Balay }
508f17f7ee2SSatish Balay 
509d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
510d71ae5a4SJacob Faibussowitsch {
511f17f7ee2SSatish Balay   PetscFunctionBegin;
512f17f7ee2SSatish Balay   MatCheckProduct(C, 1);
513ad540459SPierre Jolivet   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
5143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
515f17f7ee2SSatish Balay }
516f17f7ee2SSatish Balay 
517d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
518d71ae5a4SJacob Faibussowitsch {
519f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
520f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
521f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
522f17f7ee2SSatish Balay   #else
523f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
524f17f7ee2SSatish Balay   #endif
525f17f7ee2SSatish Balay   PetscBool    boundtocpu = PETSC_TRUE;
526f17f7ee2SSatish Balay   PetscInt     n;
527f17f7ee2SSatish Balay   PetscScalar *xx, *yy, *uxx, *uyy;
528f17f7ee2SSatish Balay   PetscMPIInt  size;
529f17f7ee2SSatish Balay   PetscBool    usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
530f17f7ee2SSatish Balay 
531f17f7ee2SSatish Balay   PetscFunctionBegin;
532f17f7ee2SSatish Balay   HLibProfile::clear();
533f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
534f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
535f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
536f17f7ee2SSatish Balay   #endif
537f17f7ee2SSatish Balay   if (usesf) {
538f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
539f17f7ee2SSatish Balay   } else n = A->rmap->n;
540f17f7ee2SSatish Balay   if (boundtocpu) {
541f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
542f17f7ee2SSatish Balay     if (sy == 0.0) {
543f17f7ee2SSatish Balay       PetscCall(VecGetArrayWrite(y, &yy));
544f17f7ee2SSatish Balay     } else {
545f17f7ee2SSatish Balay       PetscCall(VecGetArray(y, &yy));
546f17f7ee2SSatish Balay     }
547f17f7ee2SSatish Balay     if (usesf) {
548f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
549f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
550f17f7ee2SSatish Balay 
551f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
553f17f7ee2SSatish Balay       if (sy != 0.0) {
554f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
556f17f7ee2SSatish Balay       }
557f17f7ee2SSatish Balay     } else {
558f17f7ee2SSatish Balay       uxx = xx;
559f17f7ee2SSatish Balay       uyy = yy;
560f17f7ee2SSatish Balay     }
561f17f7ee2SSatish Balay     if (size > 1) {
562f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
563036e4bdcSSatish Balay       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
564f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
565f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
566f17f7ee2SSatish Balay   #endif
567f17f7ee2SSatish Balay     } else {
568f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
569f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
570f17f7ee2SSatish Balay     }
571f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
572f17f7ee2SSatish Balay     if (usesf) {
573f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
575f17f7ee2SSatish Balay     }
576f17f7ee2SSatish Balay     if (sy == 0.0) {
577f17f7ee2SSatish Balay       PetscCall(VecRestoreArrayWrite(y, &yy));
578f17f7ee2SSatish Balay     } else {
579f17f7ee2SSatish Balay       PetscCall(VecRestoreArray(y, &yy));
580f17f7ee2SSatish Balay     }
581f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
582f17f7ee2SSatish Balay   } else {
583f17f7ee2SSatish Balay     PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
584f17f7ee2SSatish Balay     if (sy == 0.0) {
585f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArrayWrite(y, &yy));
586f17f7ee2SSatish Balay     } else {
587f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArray(y, &yy));
588f17f7ee2SSatish Balay     }
589f17f7ee2SSatish Balay     if (usesf) {
590f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
591f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
592f17f7ee2SSatish Balay 
593f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
595f17f7ee2SSatish Balay       if (sy != 0.0) {
596f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
598f17f7ee2SSatish Balay       }
599f17f7ee2SSatish Balay     } else {
600f17f7ee2SSatish Balay       uxx = xx;
601f17f7ee2SSatish Balay       uyy = yy;
602f17f7ee2SSatish Balay     }
603f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
604f17f7ee2SSatish Balay     if (size > 1) {
605f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
606036e4bdcSSatish Balay       PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
607f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
608f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
609f17f7ee2SSatish Balay     #endif
610f17f7ee2SSatish Balay     } else {
611f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
612f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
613f17f7ee2SSatish Balay     }
614f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
615f17f7ee2SSatish Balay     PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
616f17f7ee2SSatish Balay     if (usesf) {
617f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
619f17f7ee2SSatish Balay     }
620f17f7ee2SSatish Balay     if (sy == 0.0) {
621f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArrayWrite(y, &yy));
622f17f7ee2SSatish Balay     } else {
623f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArray(y, &yy));
624f17f7ee2SSatish Balay     }
625f17f7ee2SSatish Balay   #endif
626f17f7ee2SSatish Balay   }
627f17f7ee2SSatish Balay   { /* log flops */
628f17f7ee2SSatish Balay     double gops, time, perf, dev;
629f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops, time, perf, dev);
630f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
631f17f7ee2SSatish Balay     if (boundtocpu) {
632f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9 * gops));
633f17f7ee2SSatish Balay     } else {
634f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9 * gops));
635f17f7ee2SSatish Balay     }
636f17f7ee2SSatish Balay   #else
637f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9 * gops));
638f17f7ee2SSatish Balay   #endif
639f17f7ee2SSatish Balay   }
6403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
641f17f7ee2SSatish Balay }
642f17f7ee2SSatish Balay 
643d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
644d71ae5a4SJacob Faibussowitsch {
645f17f7ee2SSatish Balay   PetscBool xiscuda, yiscuda;
646f17f7ee2SSatish Balay 
647f17f7ee2SSatish Balay   PetscFunctionBegin;
648f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
649f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
650f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
651f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
6523ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
653f17f7ee2SSatish Balay }
654f17f7ee2SSatish Balay 
655d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
656d71ae5a4SJacob Faibussowitsch {
657f17f7ee2SSatish Balay   PetscBool xiscuda, yiscuda;
658f17f7ee2SSatish Balay 
659f17f7ee2SSatish Balay   PetscFunctionBegin;
660f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
661f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
662f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
663f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
6643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
665f17f7ee2SSatish Balay }
666f17f7ee2SSatish Balay 
667d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
668d71ae5a4SJacob Faibussowitsch {
669f17f7ee2SSatish Balay   PetscBool xiscuda, ziscuda;
670f17f7ee2SSatish Balay 
671f17f7ee2SSatish Balay   PetscFunctionBegin;
672f17f7ee2SSatish Balay   PetscCall(VecCopy(y, z));
673f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
674f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
675f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
676f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
6773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
678f17f7ee2SSatish Balay }
679f17f7ee2SSatish Balay 
680d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
681d71ae5a4SJacob Faibussowitsch {
682f17f7ee2SSatish Balay   PetscBool xiscuda, ziscuda;
683f17f7ee2SSatish Balay 
684f17f7ee2SSatish Balay   PetscFunctionBegin;
685f17f7ee2SSatish Balay   PetscCall(VecCopy(y, z));
686f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
687f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
688f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
689f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
6903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
691f17f7ee2SSatish Balay }
692f17f7ee2SSatish Balay 
693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
694d71ae5a4SJacob Faibussowitsch {
695f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
696f17f7ee2SSatish Balay 
697f17f7ee2SSatish Balay   PetscFunctionBegin;
698f17f7ee2SSatish Balay   a->s *= s;
6993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
700f17f7ee2SSatish Balay }
701f17f7ee2SSatish Balay 
702d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
703d71ae5a4SJacob Faibussowitsch {
704f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
705f17f7ee2SSatish Balay 
706f17f7ee2SSatish Balay   PetscFunctionBegin;
707036e4bdcSSatish Balay   PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
708f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
709f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
710f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
711f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
712f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
713aaa8cc7dSPierre Jolivet   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
714f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
715f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
716f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
717036e4bdcSSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
718036e4bdcSSatish Balay   PetscOptionsHeadEnd();
7193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
720f17f7ee2SSatish Balay }
721f17f7ee2SSatish Balay 
722f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *);
723f17f7ee2SSatish Balay 
724d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
725d71ae5a4SJacob Faibussowitsch {
726f17f7ee2SSatish Balay   Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
727f17f7ee2SSatish Balay   Vec                c;
728f17f7ee2SSatish Balay   PetscInt           spacedim;
729f17f7ee2SSatish Balay   const PetscScalar *coords;
730f17f7ee2SSatish Balay 
731f17f7ee2SSatish Balay   PetscFunctionBegin;
7323ba16761SJacob Faibussowitsch   if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
733f17f7ee2SSatish Balay   PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
734f17f7ee2SSatish Balay   if (!c && a->sampler) {
735f17f7ee2SSatish Balay     Mat S = a->sampler->GetSamplingMat();
736f17f7ee2SSatish Balay 
737f17f7ee2SSatish Balay     PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
738f17f7ee2SSatish Balay   }
739f17f7ee2SSatish Balay   if (!c) {
740f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
741f17f7ee2SSatish Balay   } else {
742f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(c, &coords));
743f17f7ee2SSatish Balay     PetscCall(VecGetBlockSize(c, &spacedim));
744f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
745f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(c, &coords));
746f17f7ee2SSatish Balay   }
7473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
748f17f7ee2SSatish Balay }
749f17f7ee2SSatish Balay 
750d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
751d71ae5a4SJacob Faibussowitsch {
752f17f7ee2SSatish Balay   MPI_Comm      comm;
753f17f7ee2SSatish Balay   PetscMPIInt   size;
754f17f7ee2SSatish Balay   Mat_H2OPUS   *a = (Mat_H2OPUS *)A->data;
755f17f7ee2SSatish Balay   PetscInt      n = 0, *idx = NULL;
756f17f7ee2SSatish Balay   int          *iidx = NULL;
757f17f7ee2SSatish Balay   PetscCopyMode own;
758f17f7ee2SSatish Balay   PetscBool     rid;
759f17f7ee2SSatish Balay 
760f17f7ee2SSatish Balay   PetscFunctionBegin;
7613ba16761SJacob Faibussowitsch   if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
762f17f7ee2SSatish Balay   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
763f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
764f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
765f17f7ee2SSatish Balay     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
766f17f7ee2SSatish Balay     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
767f17f7ee2SSatish Balay     a->xxs_gpu = 1;
768f17f7ee2SSatish Balay     a->yys_gpu = 1;
769f17f7ee2SSatish Balay   #endif
770f17f7ee2SSatish Balay     a->xx  = new thrust::host_vector<PetscScalar>(n);
771f17f7ee2SSatish Balay     a->yy  = new thrust::host_vector<PetscScalar>(n);
772f17f7ee2SSatish Balay     a->xxs = 1;
773f17f7ee2SSatish Balay     a->yys = 1;
774f17f7ee2SSatish Balay   } else {
775f17f7ee2SSatish Balay     IS is;
776f17f7ee2SSatish Balay     PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
777f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(comm, &size));
778f17f7ee2SSatish Balay     if (!a->h2opus_indexmap) {
779f17f7ee2SSatish Balay       if (size > 1) {
780f17f7ee2SSatish Balay         PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
781f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
782f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
783f17f7ee2SSatish Balay         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
784f17f7ee2SSatish Balay   #endif
785f17f7ee2SSatish Balay       } else {
786f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
787f17f7ee2SSatish Balay         n    = a->hmatrix->u_basis_tree.index_map.size();
788f17f7ee2SSatish Balay       }
789f17f7ee2SSatish Balay 
790f17f7ee2SSatish Balay       if (PetscDefined(USE_64BIT_INDICES)) {
791f17f7ee2SSatish Balay         PetscInt i;
792f17f7ee2SSatish Balay 
793f17f7ee2SSatish Balay         own = PETSC_OWN_POINTER;
794f17f7ee2SSatish Balay         PetscCall(PetscMalloc1(n, &idx));
795f17f7ee2SSatish Balay         for (i = 0; i < n; i++) idx[i] = iidx[i];
796f17f7ee2SSatish Balay       } else {
797f17f7ee2SSatish Balay         own = PETSC_COPY_VALUES;
798f17f7ee2SSatish Balay         idx = (PetscInt *)iidx;
799f17f7ee2SSatish Balay       }
800f17f7ee2SSatish Balay       PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
801f17f7ee2SSatish Balay       PetscCall(ISSetPermutation(is));
802f17f7ee2SSatish Balay       PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
803f17f7ee2SSatish Balay       a->h2opus_indexmap = is;
804f17f7ee2SSatish Balay     }
805f17f7ee2SSatish Balay     PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
806f17f7ee2SSatish Balay     PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
807f17f7ee2SSatish Balay     rid = (PetscBool)(n == A->rmap->n);
808f17f7ee2SSatish Balay     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
80948a46eb9SPierre Jolivet     if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
810f17f7ee2SSatish Balay     if (!rid) {
811f17f7ee2SSatish Balay       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
812f17f7ee2SSatish Balay         PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
813f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
814f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
815f17f7ee2SSatish Balay         PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
816f17f7ee2SSatish Balay       }
817f17f7ee2SSatish Balay       PetscCall(PetscSFCreate(comm, &a->sf));
818f17f7ee2SSatish Balay       PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
819036e4bdcSSatish Balay       PetscCall(PetscSFSetUp(a->sf));
820f17f7ee2SSatish Balay       PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
821f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
822f17f7ee2SSatish Balay       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
823f17f7ee2SSatish Balay       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
824f17f7ee2SSatish Balay       a->xxs_gpu = 1;
825f17f7ee2SSatish Balay       a->yys_gpu = 1;
826f17f7ee2SSatish Balay   #endif
827f17f7ee2SSatish Balay       a->xx  = new thrust::host_vector<PetscScalar>(n);
828f17f7ee2SSatish Balay       a->yy  = new thrust::host_vector<PetscScalar>(n);
829f17f7ee2SSatish Balay       a->xxs = 1;
830f17f7ee2SSatish Balay       a->yys = 1;
831f17f7ee2SSatish Balay     }
832f17f7ee2SSatish Balay     PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
833f17f7ee2SSatish Balay   }
834f17f7ee2SSatish Balay   a->multsetup = PETSC_TRUE;
8353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
836f17f7ee2SSatish Balay }
837f17f7ee2SSatish Balay 
838d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
839d71ae5a4SJacob Faibussowitsch {
840f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
841f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
842f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle->handle;
843f17f7ee2SSatish Balay   #else
844f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle;
845f17f7ee2SSatish Balay   #endif
846f17f7ee2SSatish Balay   PetscBool   kernel       = PETSC_FALSE;
847f17f7ee2SSatish Balay   PetscBool   boundtocpu   = PETSC_TRUE;
848f17f7ee2SSatish Balay   PetscBool   samplingdone = PETSC_FALSE;
849f17f7ee2SSatish Balay   MPI_Comm    comm;
850f17f7ee2SSatish Balay   PetscMPIInt size;
851f17f7ee2SSatish Balay 
852f17f7ee2SSatish Balay   PetscFunctionBegin;
853f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
854036e4bdcSSatish Balay   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
855036e4bdcSSatish Balay   PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
856f17f7ee2SSatish Balay 
857f17f7ee2SSatish Balay   /* XXX */
858f17f7ee2SSatish Balay   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
859f17f7ee2SSatish Balay 
860f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm, &size));
861f17f7ee2SSatish Balay   /* TODO REUSABILITY of geometric construction */
862f17f7ee2SSatish Balay   delete a->hmatrix;
863f17f7ee2SSatish Balay   delete a->dist_hmatrix;
864f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
865f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
866f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
867f17f7ee2SSatish Balay   #endif
868f17f7ee2SSatish Balay   a->orthogonal = PETSC_FALSE;
869f17f7ee2SSatish Balay 
870f17f7ee2SSatish Balay   /* TODO: other? */
871f17f7ee2SSatish Balay   H2OpusBoxCenterAdmissibility adm(a->eta);
872f17f7ee2SSatish Balay 
873f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
874f17f7ee2SSatish Balay   if (size > 1) {
875f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
876f17f7ee2SSatish Balay     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
877f17f7ee2SSatish Balay   #else
878f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
879f17f7ee2SSatish Balay   #endif
880b94d7dedSBarry Smith   } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
881f17f7ee2SSatish Balay   PetscCall(MatH2OpusInferCoordinates_Private(A));
882f17f7ee2SSatish Balay   PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
883f17f7ee2SSatish Balay   if (a->kernel) {
884f17f7ee2SSatish Balay     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
885f17f7ee2SSatish Balay     if (size > 1) {
886f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
887f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
888f17f7ee2SSatish Balay       buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
889f17f7ee2SSatish Balay   #endif
890f17f7ee2SSatish Balay     } else {
891f17f7ee2SSatish Balay       buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
892f17f7ee2SSatish Balay     }
893f17f7ee2SSatish Balay     kernel = PETSC_TRUE;
894f17f7ee2SSatish Balay   } else {
895036e4bdcSSatish Balay     PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
896f17f7ee2SSatish Balay     buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
897f17f7ee2SSatish Balay   }
898f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
899f17f7ee2SSatish Balay 
900f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
901f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
902f17f7ee2SSatish Balay   if (!boundtocpu) {
903f17f7ee2SSatish Balay     if (size > 1) {
904f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
905f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
906f17f7ee2SSatish Balay       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
907f17f7ee2SSatish Balay     #endif
908f17f7ee2SSatish Balay     } else {
909f17f7ee2SSatish Balay       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
910f17f7ee2SSatish Balay     }
911f17f7ee2SSatish Balay   }
912f17f7ee2SSatish Balay   #endif
913f17f7ee2SSatish Balay   if (size == 1) {
914f17f7ee2SSatish Balay     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
915f17f7ee2SSatish Balay       PetscReal Anorm;
916f17f7ee2SSatish Balay       bool      verbose;
917f17f7ee2SSatish Balay 
918f17f7ee2SSatish Balay       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
919f17f7ee2SSatish Balay       verbose = a->hara_verbose;
920f17f7ee2SSatish Balay       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
921f17f7ee2SSatish Balay       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
922ad540459SPierre Jolivet       if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
923f17f7ee2SSatish Balay       a->sampler->SetStream(handle->getMainStream());
924f17f7ee2SSatish Balay       if (boundtocpu) {
925f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(false);
926f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
927f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
928f17f7ee2SSatish Balay       } else {
929f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(true);
930f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
931f17f7ee2SSatish Balay   #endif
932f17f7ee2SSatish Balay       }
933f17f7ee2SSatish Balay       samplingdone = PETSC_TRUE;
934f17f7ee2SSatish Balay     }
935f17f7ee2SSatish Balay   }
936f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
937f17f7ee2SSatish Balay   if (!boundtocpu) {
938f17f7ee2SSatish Balay     delete a->hmatrix;
939f17f7ee2SSatish Balay     delete a->dist_hmatrix;
940f17f7ee2SSatish Balay     a->hmatrix      = NULL;
941f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
942f17f7ee2SSatish Balay   }
943f17f7ee2SSatish Balay   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
944f17f7ee2SSatish Balay   #endif
945f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
946f17f7ee2SSatish Balay 
947f17f7ee2SSatish Balay   if (!a->s) a->s = 1.0;
948f17f7ee2SSatish Balay   A->assembled = PETSC_TRUE;
949f17f7ee2SSatish Balay 
950f17f7ee2SSatish Balay   if (samplingdone) {
951f17f7ee2SSatish Balay     PetscBool check  = a->check_construction;
952f17f7ee2SSatish Balay     PetscBool checke = PETSC_FALSE;
953f17f7ee2SSatish Balay 
954f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
955f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
956f17f7ee2SSatish Balay     if (check) {
957f17f7ee2SSatish Balay       Mat       E, Ae;
958f17f7ee2SSatish Balay       PetscReal n1, ni, n2;
959f17f7ee2SSatish Balay       PetscReal n1A, niA, n2A;
960f17f7ee2SSatish Balay       void (*normfunc)(void);
961f17f7ee2SSatish Balay 
962f17f7ee2SSatish Balay       Ae = a->sampler->GetSamplingMat();
963f17f7ee2SSatish Balay       PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
964f17f7ee2SSatish Balay       PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
965f17f7ee2SSatish Balay       PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
966f17f7ee2SSatish Balay       PetscCall(MatNorm(E, NORM_1, &n1));
967f17f7ee2SSatish Balay       PetscCall(MatNorm(E, NORM_INFINITY, &ni));
968f17f7ee2SSatish Balay       PetscCall(MatNorm(E, NORM_2, &n2));
969f17f7ee2SSatish Balay       if (checke) {
970f17f7ee2SSatish Balay         Mat eA, eE, eAe;
971f17f7ee2SSatish Balay 
972f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(A, MATAIJ, &eA));
973f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(E, MATAIJ, &eE));
974f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
975f17f7ee2SSatish Balay         PetscCall(MatChop(eA, PETSC_SMALL));
976f17f7ee2SSatish Balay         PetscCall(MatChop(eE, PETSC_SMALL));
977f17f7ee2SSatish Balay         PetscCall(MatChop(eAe, PETSC_SMALL));
978f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
979f17f7ee2SSatish Balay         PetscCall(MatView(eA, NULL));
980f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
981f17f7ee2SSatish Balay         PetscCall(MatView(eAe, NULL));
982f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
983f17f7ee2SSatish Balay         PetscCall(MatView(eE, NULL));
984f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eA));
985f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eE));
986f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eAe));
987f17f7ee2SSatish Balay       }
988f17f7ee2SSatish Balay 
989f17f7ee2SSatish Balay       PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
990f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
991f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae, NORM_1, &n1A));
992f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
993f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae, NORM_2, &n2A));
994f17f7ee2SSatish Balay       n1A = PetscMax(n1A, PETSC_SMALL);
995f17f7ee2SSatish Balay       n2A = PetscMax(n2A, PETSC_SMALL);
996f17f7ee2SSatish Balay       niA = PetscMax(niA, PETSC_SMALL);
997f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
998f17f7ee2SSatish Balay       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
999f17f7ee2SSatish Balay       PetscCall(MatDestroy(&E));
1000f17f7ee2SSatish Balay     }
1001f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(NULL);
1002f17f7ee2SSatish Balay   }
10033ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1004f17f7ee2SSatish Balay }
1005f17f7ee2SSatish Balay 
1006d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007d71ae5a4SJacob Faibussowitsch {
1008f17f7ee2SSatish Balay   PetscMPIInt size;
1009f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1010f17f7ee2SSatish Balay 
1011f17f7ee2SSatish Balay   PetscFunctionBegin;
1012f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1013036e4bdcSSatish Balay   PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1014f17f7ee2SSatish Balay   a->hmatrix->clearData();
1015f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1016f17f7ee2SSatish Balay   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017f17f7ee2SSatish Balay   #endif
10183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1019f17f7ee2SSatish Balay }
1020f17f7ee2SSatish Balay 
1021d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1022d71ae5a4SJacob Faibussowitsch {
1023f17f7ee2SSatish Balay   Mat         A;
1024f17f7ee2SSatish Balay   Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1025f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1026f17f7ee2SSatish Balay   PetscBool iscpu = PETSC_FALSE;
1027f17f7ee2SSatish Balay   #else
1028f17f7ee2SSatish Balay   PetscBool iscpu = PETSC_TRUE;
1029f17f7ee2SSatish Balay   #endif
1030f17f7ee2SSatish Balay   MPI_Comm comm;
1031f17f7ee2SSatish Balay 
1032f17f7ee2SSatish Balay   PetscFunctionBegin;
1033f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1034f17f7ee2SSatish Balay   PetscCall(MatCreate(comm, &A));
1035f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1036f17f7ee2SSatish Balay   PetscCall(MatSetType(A, MATH2OPUS));
1037f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B, A));
1038f17f7ee2SSatish Balay   a = (Mat_H2OPUS *)A->data;
1039f17f7ee2SSatish Balay 
1040f17f7ee2SSatish Balay   a->eta              = b->eta;
1041f17f7ee2SSatish Balay   a->leafsize         = b->leafsize;
1042f17f7ee2SSatish Balay   a->basisord         = b->basisord;
1043f17f7ee2SSatish Balay   a->max_rank         = b->max_rank;
1044f17f7ee2SSatish Balay   a->bs               = b->bs;
1045f17f7ee2SSatish Balay   a->rtol             = b->rtol;
1046f17f7ee2SSatish Balay   a->norm_max_samples = b->norm_max_samples;
1047f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) a->s = b->s;
1048f17f7ee2SSatish Balay 
1049f17f7ee2SSatish Balay   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1050f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1051f17f7ee2SSatish Balay 
1052f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1053ad540459SPierre Jolivet   if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1054f17f7ee2SSatish Balay     #if defined(PETSC_H2OPUS_USE_GPU)
1055ad540459SPierre Jolivet   if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1056f17f7ee2SSatish Balay     #endif
1057f17f7ee2SSatish Balay   #endif
1058f17f7ee2SSatish Balay   if (b->hmatrix) {
1059f17f7ee2SSatish Balay     a->hmatrix = new HMatrix(*b->hmatrix);
1060f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1061f17f7ee2SSatish Balay   }
1062f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1063f17f7ee2SSatish Balay   if (b->hmatrix_gpu) {
1064f17f7ee2SSatish Balay     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1065f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1066f17f7ee2SSatish Balay   }
1067f17f7ee2SSatish Balay   #endif
1068f17f7ee2SSatish Balay   if (b->sf) {
1069f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->sf));
1070f17f7ee2SSatish Balay     a->sf = b->sf;
1071f17f7ee2SSatish Balay   }
1072f17f7ee2SSatish Balay   if (b->h2opus_indexmap) {
1073f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1074f17f7ee2SSatish Balay     a->h2opus_indexmap = b->h2opus_indexmap;
1075f17f7ee2SSatish Balay   }
1076f17f7ee2SSatish Balay 
1077f17f7ee2SSatish Balay   PetscCall(MatSetUp(A));
1078f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
1079f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) {
1080f17f7ee2SSatish Balay     A->assembled  = PETSC_TRUE;
1081f17f7ee2SSatish Balay     a->orthogonal = b->orthogonal;
1082f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1083f17f7ee2SSatish Balay     A->offloadmask = B->offloadmask;
1084f17f7ee2SSatish Balay   #endif
1085f17f7ee2SSatish Balay   }
1086f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1087f17f7ee2SSatish Balay   iscpu = B->boundtocpu;
1088f17f7ee2SSatish Balay   #endif
1089f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A, iscpu));
1090f17f7ee2SSatish Balay 
1091f17f7ee2SSatish Balay   *nA = A;
10923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1093f17f7ee2SSatish Balay }
1094f17f7ee2SSatish Balay 
1095d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1096d71ae5a4SJacob Faibussowitsch {
1097f17f7ee2SSatish Balay   Mat_H2OPUS       *h2opus = (Mat_H2OPUS *)A->data;
1098036e4bdcSSatish Balay   PetscBool         isascii, vieweps;
1099f17f7ee2SSatish Balay   PetscMPIInt       size;
1100f17f7ee2SSatish Balay   PetscViewerFormat format;
1101f17f7ee2SSatish Balay 
1102f17f7ee2SSatish Balay   PetscFunctionBegin;
1103f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1104f17f7ee2SSatish Balay   PetscCall(PetscViewerGetFormat(view, &format));
1105f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1106f17f7ee2SSatish Balay   if (isascii) {
1107f17f7ee2SSatish Balay     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1108f17f7ee2SSatish Balay       if (size == 1) {
1109f17f7ee2SSatish Balay         FILE *fp;
1110f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1111f17f7ee2SSatish Balay         dumpHMatrix(*h2opus->hmatrix, 6, fp);
1112f17f7ee2SSatish Balay       }
1113f17f7ee2SSatish Balay     } else {
1114f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view, "  H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1115f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view, "  PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1116f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view, "  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1117f17f7ee2SSatish Balay       if (!h2opus->kernel) {
1118f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1119f17f7ee2SSatish Balay       } else {
1120f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1121f17f7ee2SSatish Balay       }
1122f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view, "  Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1123f17f7ee2SSatish Balay       if (size == 1) {
1124f17f7ee2SSatish Balay         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1125f17f7ee2SSatish Balay         double low_rank_cpu  = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1126f17f7ee2SSatish Balay   #if defined(PETSC_HAVE_CUDA)
1127f17f7ee2SSatish Balay         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1128f17f7ee2SSatish Balay         double low_rank_gpu  = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1129f17f7ee2SSatish Balay   #endif
1130f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1131f17f7ee2SSatish Balay   #if defined(PETSC_HAVE_CUDA)
1132f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1133f17f7ee2SSatish Balay   #endif
1134f17f7ee2SSatish Balay       } else {
1135f17f7ee2SSatish Balay   #if defined(PETSC_HAVE_CUDA)
1136f17f7ee2SSatish Balay         double      matrix_mem[4] = {0., 0., 0., 0.};
1137f17f7ee2SSatish Balay         PetscMPIInt rsize         = 4;
1138f17f7ee2SSatish Balay   #else
1139f17f7ee2SSatish Balay         double      matrix_mem[2] = {0., 0.};
1140f17f7ee2SSatish Balay         PetscMPIInt rsize         = 2;
1141f17f7ee2SSatish Balay   #endif
1142f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1143f17f7ee2SSatish Balay         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1144f17f7ee2SSatish Balay         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1145f17f7ee2SSatish Balay     #if defined(PETSC_HAVE_CUDA)
1146f17f7ee2SSatish Balay         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1147f17f7ee2SSatish Balay         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1148f17f7ee2SSatish Balay     #endif
1149f17f7ee2SSatish Balay   #endif
1150f17f7ee2SSatish Balay         PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1151f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1152f17f7ee2SSatish Balay   #if defined(PETSC_HAVE_CUDA)
1153f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view, "  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1154f17f7ee2SSatish Balay   #endif
1155f17f7ee2SSatish Balay       }
1156f17f7ee2SSatish Balay     }
1157f17f7ee2SSatish Balay   }
1158036e4bdcSSatish Balay   vieweps = PETSC_FALSE;
1159036e4bdcSSatish Balay   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1160036e4bdcSSatish Balay   if (vieweps) {
1161f17f7ee2SSatish Balay     char        filename[256];
1162f17f7ee2SSatish Balay     const char *name;
1163f17f7ee2SSatish Balay 
1164f17f7ee2SSatish Balay     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1165f17f7ee2SSatish Balay     PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1166036e4bdcSSatish Balay     PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1167f17f7ee2SSatish Balay     outputEps(*h2opus->hmatrix, filename);
1168f17f7ee2SSatish Balay   }
11693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1170f17f7ee2SSatish Balay }
1171f17f7ee2SSatish Balay 
1172d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1173d71ae5a4SJacob Faibussowitsch {
1174f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1175f17f7ee2SSatish Balay   PetscReal  *gcoords;
1176f17f7ee2SSatish Balay   PetscInt    N;
1177f17f7ee2SSatish Balay   MPI_Comm    comm;
1178f17f7ee2SSatish Balay   PetscMPIInt size;
1179f17f7ee2SSatish Balay   PetscBool   cong;
1180f17f7ee2SSatish Balay 
1181f17f7ee2SSatish Balay   PetscFunctionBegin;
1182f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->rmap));
1183f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->cmap));
1184f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1185f17f7ee2SSatish Balay   PetscCall(MatHasCongruentLayouts(A, &cong));
1186f17f7ee2SSatish Balay   PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1187f17f7ee2SSatish Balay   N = A->rmap->N;
1188f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm, &size));
1189f17f7ee2SSatish Balay   if (spacedim > 0 && size > 1 && cdist) {
1190f17f7ee2SSatish Balay     PetscSF      sf;
1191f17f7ee2SSatish Balay     MPI_Datatype dtype;
1192f17f7ee2SSatish Balay 
1193f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1194f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_commit(&dtype));
1195f17f7ee2SSatish Balay 
1196f17f7ee2SSatish Balay     PetscCall(PetscSFCreate(comm, &sf));
1197f17f7ee2SSatish Balay     PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1198f17f7ee2SSatish Balay     PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1199f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1200f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1201f17f7ee2SSatish Balay     PetscCall(PetscSFDestroy(&sf));
1202f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_free(&dtype));
1203f17f7ee2SSatish Balay   } else gcoords = (PetscReal *)coords;
1204f17f7ee2SSatish Balay 
1205f17f7ee2SSatish Balay   delete h2opus->ptcloud;
1206f17f7ee2SSatish Balay   delete h2opus->kernel;
1207f17f7ee2SSatish Balay   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1208f17f7ee2SSatish Balay   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1209f17f7ee2SSatish Balay   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1210f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
12113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1212f17f7ee2SSatish Balay }
1213f17f7ee2SSatish Balay 
1214f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1216d71ae5a4SJacob Faibussowitsch {
1217f17f7ee2SSatish Balay   PetscMPIInt size;
1218f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1219f17f7ee2SSatish Balay 
1220f17f7ee2SSatish Balay   PetscFunctionBegin;
1221f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1222f17f7ee2SSatish Balay   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1223f17f7ee2SSatish Balay     if (size > 1) {
1224f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1225f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
1226f17f7ee2SSatish Balay       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1227f17f7ee2SSatish Balay       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1228f17f7ee2SSatish Balay     #endif
1229f17f7ee2SSatish Balay     } else {
1230f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1231f17f7ee2SSatish Balay       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1232f17f7ee2SSatish Balay       else *a->hmatrix = *a->hmatrix_gpu;
1233f17f7ee2SSatish Balay     }
1234f17f7ee2SSatish Balay     delete a->hmatrix_gpu;
1235f17f7ee2SSatish Balay     delete a->dist_hmatrix_gpu;
1236f17f7ee2SSatish Balay     a->hmatrix_gpu      = NULL;
1237f17f7ee2SSatish Balay     a->dist_hmatrix_gpu = NULL;
1238f17f7ee2SSatish Balay     A->offloadmask      = PETSC_OFFLOAD_CPU;
1239f17f7ee2SSatish Balay   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1240f17f7ee2SSatish Balay     if (size > 1) {
1241f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1242f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
1243f17f7ee2SSatish Balay       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1244f17f7ee2SSatish Balay       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1245f17f7ee2SSatish Balay     #endif
1246f17f7ee2SSatish Balay     } else {
1247f17f7ee2SSatish Balay       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1248f17f7ee2SSatish Balay       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1249f17f7ee2SSatish Balay       else *a->hmatrix_gpu = *a->hmatrix;
1250f17f7ee2SSatish Balay     }
1251f17f7ee2SSatish Balay     delete a->hmatrix;
1252f17f7ee2SSatish Balay     delete a->dist_hmatrix;
1253f17f7ee2SSatish Balay     a->hmatrix      = NULL;
1254f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
1255f17f7ee2SSatish Balay     A->offloadmask  = PETSC_OFFLOAD_GPU;
1256f17f7ee2SSatish Balay   }
1257f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1258f17f7ee2SSatish Balay   if (!flg) {
1259f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1260f17f7ee2SSatish Balay   } else {
1261f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1262f17f7ee2SSatish Balay   }
1263f17f7ee2SSatish Balay   A->boundtocpu = flg;
12643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1265f17f7ee2SSatish Balay }
1266f17f7ee2SSatish Balay   #endif
1267f17f7ee2SSatish Balay 
1268f17f7ee2SSatish Balay /*MC
1269f17f7ee2SSatish Balay      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1270f17f7ee2SSatish Balay 
12712ef1f0ffSBarry Smith    Options Database Key:
12722ef1f0ffSBarry Smith .     -mat_type h2opus - matrix type to "h2opus"
12732ef1f0ffSBarry Smith 
12742ef1f0ffSBarry Smith    Level: beginner
1275f17f7ee2SSatish Balay 
1276f17f7ee2SSatish Balay    Notes:
127711a5261eSBarry Smith      H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs.
127811a5261eSBarry Smith 
12792ef1f0ffSBarry Smith      For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus.
12802ef1f0ffSBarry Smith      In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`.
1281f17f7ee2SSatish Balay 
128211a5261eSBarry Smith    Reference:
128311a5261eSBarry Smith .  * -  "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451
128411a5261eSBarry Smith 
1285*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1286f17f7ee2SSatish Balay M*/
1287d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1288d71ae5a4SJacob Faibussowitsch {
1289f17f7ee2SSatish Balay   Mat_H2OPUS *a;
1290f17f7ee2SSatish Balay   PetscMPIInt size;
1291f17f7ee2SSatish Balay 
1292f17f7ee2SSatish Balay   PetscFunctionBegin;
1293f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1294f17f7ee2SSatish Balay   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1295f17f7ee2SSatish Balay   #endif
12964dfa11a4SJacob Faibussowitsch   PetscCall(PetscNew(&a));
1297f17f7ee2SSatish Balay   A->data = (void *)a;
1298f17f7ee2SSatish Balay 
1299f17f7ee2SSatish Balay   a->eta              = 0.9;
1300f17f7ee2SSatish Balay   a->leafsize         = 32;
1301f17f7ee2SSatish Balay   a->basisord         = 4;
1302f17f7ee2SSatish Balay   a->max_rank         = 64;
1303f17f7ee2SSatish Balay   a->bs               = 32;
1304f17f7ee2SSatish Balay   a->rtol             = 1.e-4;
1305f17f7ee2SSatish Balay   a->s                = 1.0;
1306f17f7ee2SSatish Balay   a->norm_max_samples = 10;
1307036e4bdcSSatish Balay   a->resize           = PETSC_TRUE; /* reallocate after compression */
1308f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1309f17f7ee2SSatish Balay   h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1310f17f7ee2SSatish Balay   #else
1311f17f7ee2SSatish Balay   h2opusCreateHandle(&a->handle);
1312f17f7ee2SSatish Balay   #endif
1313f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1314f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1315f17f7ee2SSatish Balay   PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1316f17f7ee2SSatish Balay 
1317f17f7ee2SSatish Balay   A->ops->destroy          = MatDestroy_H2OPUS;
1318f17f7ee2SSatish Balay   A->ops->view             = MatView_H2OPUS;
1319f17f7ee2SSatish Balay   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1320f17f7ee2SSatish Balay   A->ops->mult             = MatMult_H2OPUS;
1321f17f7ee2SSatish Balay   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1322f17f7ee2SSatish Balay   A->ops->multadd          = MatMultAdd_H2OPUS;
1323f17f7ee2SSatish Balay   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1324f17f7ee2SSatish Balay   A->ops->scale            = MatScale_H2OPUS;
1325f17f7ee2SSatish Balay   A->ops->duplicate        = MatDuplicate_H2OPUS;
1326f17f7ee2SSatish Balay   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1327f17f7ee2SSatish Balay   A->ops->norm             = MatNorm_H2OPUS;
1328f17f7ee2SSatish Balay   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1329f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1330f17f7ee2SSatish Balay   A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1331f17f7ee2SSatish Balay   #endif
1332f17f7ee2SSatish Balay 
1333f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1334f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1335f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1336f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1337f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1338f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1339f17f7ee2SSatish Balay   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1340f17f7ee2SSatish Balay   #endif
13413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1342f17f7ee2SSatish Balay }
1343f17f7ee2SSatish Balay 
1344f17f7ee2SSatish Balay /*@C
1345f17f7ee2SSatish Balay      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1346f17f7ee2SSatish Balay 
1347f17f7ee2SSatish Balay    Input Parameter:
1348f17f7ee2SSatish Balay .     A - the matrix
1349f17f7ee2SSatish Balay 
1350f17f7ee2SSatish Balay    Level: intermediate
1351f17f7ee2SSatish Balay 
1352*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1353f17f7ee2SSatish Balay @*/
1354d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1355d71ae5a4SJacob Faibussowitsch {
1356f17f7ee2SSatish Balay   PetscBool   ish2opus;
1357f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1358f17f7ee2SSatish Balay   PetscMPIInt size;
1359f17f7ee2SSatish Balay   PetscBool   boundtocpu = PETSC_TRUE;
1360f17f7ee2SSatish Balay 
1361f17f7ee2SSatish Balay   PetscFunctionBegin;
1362f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1363f17f7ee2SSatish Balay   PetscValidType(A, 1);
1364f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
13653ba16761SJacob Faibussowitsch   if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
13663ba16761SJacob Faibussowitsch   if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1367f17f7ee2SSatish Balay   HLibProfile::clear();
1368f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1369f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1370f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1371f17f7ee2SSatish Balay   #endif
1372f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1373f17f7ee2SSatish Balay   if (size > 1) {
1374f17f7ee2SSatish Balay     if (boundtocpu) {
1375f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1376f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1377f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix, a->handle);
1378f17f7ee2SSatish Balay   #endif
1379f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1380f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1381f17f7ee2SSatish Balay     } else {
1382f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1383f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1384f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
1385f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1386f17f7ee2SSatish Balay     #endif
1387f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1388f17f7ee2SSatish Balay   #endif
1389f17f7ee2SSatish Balay     }
1390f17f7ee2SSatish Balay   } else {
1391f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1392f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1393f17f7ee2SSatish Balay   #else
1394f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1395f17f7ee2SSatish Balay   #endif
1396f17f7ee2SSatish Balay     if (boundtocpu) {
1397f17f7ee2SSatish Balay       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1398f17f7ee2SSatish Balay       horthog(*a->hmatrix, handle);
1399f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1400f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1401f17f7ee2SSatish Balay     } else {
1402f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1403f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1404f17f7ee2SSatish Balay       horthog(*a->hmatrix_gpu, handle);
1405f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1406f17f7ee2SSatish Balay   #endif
1407f17f7ee2SSatish Balay     }
1408f17f7ee2SSatish Balay   }
1409f17f7ee2SSatish Balay   a->orthogonal = PETSC_TRUE;
1410f17f7ee2SSatish Balay   { /* log flops */
1411f17f7ee2SSatish Balay     double gops, time, perf, dev;
1412f17f7ee2SSatish Balay     HLibProfile::getHorthogPerf(gops, time, perf, dev);
1413f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1414f17f7ee2SSatish Balay     if (boundtocpu) {
1415f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9 * gops));
1416f17f7ee2SSatish Balay     } else {
1417f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9 * gops));
1418f17f7ee2SSatish Balay     }
1419f17f7ee2SSatish Balay   #else
1420f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9 * gops));
1421f17f7ee2SSatish Balay   #endif
1422f17f7ee2SSatish Balay   }
1423f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
14243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1425f17f7ee2SSatish Balay }
1426f17f7ee2SSatish Balay 
1427f17f7ee2SSatish Balay /*@C
1428f17f7ee2SSatish Balay      MatH2OpusCompress - Compress a hierarchical matrix.
1429f17f7ee2SSatish Balay 
1430f17f7ee2SSatish Balay    Input Parameters:
1431f17f7ee2SSatish Balay +     A - the matrix
1432f17f7ee2SSatish Balay -     tol - the absolute truncation threshold
1433f17f7ee2SSatish Balay 
1434f17f7ee2SSatish Balay    Level: intermediate
1435f17f7ee2SSatish Balay 
1436*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1437f17f7ee2SSatish Balay @*/
1438d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1439d71ae5a4SJacob Faibussowitsch {
1440f17f7ee2SSatish Balay   PetscBool   ish2opus;
1441f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1442f17f7ee2SSatish Balay   PetscMPIInt size;
1443f17f7ee2SSatish Balay   PetscBool   boundtocpu = PETSC_TRUE;
1444f17f7ee2SSatish Balay 
1445f17f7ee2SSatish Balay   PetscFunctionBegin;
1446f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1447f17f7ee2SSatish Balay   PetscValidType(A, 1);
1448f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A, tol, 2);
1449f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
14503ba16761SJacob Faibussowitsch   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1451f17f7ee2SSatish Balay   PetscCall(MatH2OpusOrthogonalize(A));
1452f17f7ee2SSatish Balay   HLibProfile::clear();
1453f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1454f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1455f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1456f17f7ee2SSatish Balay   #endif
1457f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1458f17f7ee2SSatish Balay   if (size > 1) {
1459f17f7ee2SSatish Balay     if (boundtocpu) {
1460f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1461f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1462f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1463036e4bdcSSatish Balay       if (a->resize) {
1464036e4bdcSSatish Balay         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1465036e4bdcSSatish Balay         delete a->dist_hmatrix;
1466036e4bdcSSatish Balay         a->dist_hmatrix = dist_hmatrix;
1467036e4bdcSSatish Balay       }
1468f17f7ee2SSatish Balay   #endif
1469f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1470f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1471f17f7ee2SSatish Balay     } else {
1472f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1473f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1474f17f7ee2SSatish Balay     #if defined(H2OPUS_USE_MPI)
1475f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1476036e4bdcSSatish Balay 
1477036e4bdcSSatish Balay       if (a->resize) {
1478036e4bdcSSatish Balay         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1479036e4bdcSSatish Balay         delete a->dist_hmatrix_gpu;
1480036e4bdcSSatish Balay         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1481036e4bdcSSatish Balay       }
1482f17f7ee2SSatish Balay     #endif
1483f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1484f17f7ee2SSatish Balay   #endif
1485f17f7ee2SSatish Balay     }
1486f17f7ee2SSatish Balay   } else {
1487f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1488f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1489f17f7ee2SSatish Balay   #else
1490f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1491f17f7ee2SSatish Balay   #endif
1492f17f7ee2SSatish Balay     if (boundtocpu) {
1493f17f7ee2SSatish Balay       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1494f17f7ee2SSatish Balay       hcompress(*a->hmatrix, tol, handle);
1495036e4bdcSSatish Balay 
1496036e4bdcSSatish Balay       if (a->resize) {
1497036e4bdcSSatish Balay         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1498036e4bdcSSatish Balay         delete a->hmatrix;
1499036e4bdcSSatish Balay         a->hmatrix = hmatrix;
1500036e4bdcSSatish Balay       }
1501f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1502f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1503f17f7ee2SSatish Balay     } else {
1504f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1505f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1506f17f7ee2SSatish Balay       hcompress(*a->hmatrix_gpu, tol, handle);
1507f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1508036e4bdcSSatish Balay 
1509036e4bdcSSatish Balay       if (a->resize) {
1510036e4bdcSSatish Balay         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1511036e4bdcSSatish Balay         delete a->hmatrix_gpu;
1512036e4bdcSSatish Balay         a->hmatrix_gpu = hmatrix_gpu;
1513036e4bdcSSatish Balay       }
1514f17f7ee2SSatish Balay   #endif
1515f17f7ee2SSatish Balay     }
1516f17f7ee2SSatish Balay   }
1517f17f7ee2SSatish Balay   { /* log flops */
1518f17f7ee2SSatish Balay     double gops, time, perf, dev;
1519f17f7ee2SSatish Balay     HLibProfile::getHcompressPerf(gops, time, perf, dev);
1520f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1521f17f7ee2SSatish Balay     if (boundtocpu) {
1522f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9 * gops));
1523f17f7ee2SSatish Balay     } else {
1524f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9 * gops));
1525f17f7ee2SSatish Balay     }
1526f17f7ee2SSatish Balay   #else
1527f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9 * gops));
1528f17f7ee2SSatish Balay   #endif
1529f17f7ee2SSatish Balay   }
1530f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
15313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1532f17f7ee2SSatish Balay }
1533f17f7ee2SSatish Balay 
1534f17f7ee2SSatish Balay /*@C
15352ef1f0ffSBarry Smith      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix.
1536f17f7ee2SSatish Balay 
1537f17f7ee2SSatish Balay    Input Parameters:
1538f17f7ee2SSatish Balay +     A - the hierarchical matrix
1539f17f7ee2SSatish Balay .     B - the matrix to be sampled
1540f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1541f17f7ee2SSatish Balay -     tol - relative tolerance for construction
1542f17f7ee2SSatish Balay 
154311a5261eSBarry Smith    Notes:
154411a5261eSBarry Smith    You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
1545f17f7ee2SSatish Balay 
1546f17f7ee2SSatish Balay    Level: intermediate
1547f17f7ee2SSatish Balay 
1548*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1549f17f7ee2SSatish Balay @*/
1550d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1551d71ae5a4SJacob Faibussowitsch {
1552f17f7ee2SSatish Balay   PetscBool ish2opus;
1553f17f7ee2SSatish Balay 
1554f17f7ee2SSatish Balay   PetscFunctionBegin;
1555f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1556f17f7ee2SSatish Balay   PetscValidType(A, 1);
1557f17f7ee2SSatish Balay   if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
1558f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(A, bs, 3);
1559f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A, tol, 3);
1560f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1561f17f7ee2SSatish Balay   if (ish2opus) {
1562f17f7ee2SSatish Balay     Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1563f17f7ee2SSatish Balay 
1564f17f7ee2SSatish Balay     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1565f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(B);
1566f17f7ee2SSatish Balay     if (bs > 0) a->bs = bs;
1567f17f7ee2SSatish Balay     if (tol > 0.) a->rtol = tol;
1568f17f7ee2SSatish Balay     delete a->kernel;
1569f17f7ee2SSatish Balay   }
15703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1571f17f7ee2SSatish Balay }
1572f17f7ee2SSatish Balay 
1573f17f7ee2SSatish Balay /*@C
157411a5261eSBarry Smith      MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1575f17f7ee2SSatish Balay 
1576f17f7ee2SSatish Balay    Input Parameters:
1577f17f7ee2SSatish Balay +     comm - MPI communicator
15782ef1f0ffSBarry Smith .     m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
15792ef1f0ffSBarry Smith .     n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
15802ef1f0ffSBarry Smith .     M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
15812ef1f0ffSBarry Smith .     N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1582f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1583f17f7ee2SSatish Balay .     coords - coordinates of the points
1584f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
15852ef1f0ffSBarry Smith .     kernel - computational kernel (or `NULL`)
1586f17f7ee2SSatish Balay .     kernelctx - kernel context
1587f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1588f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1589f17f7ee2SSatish Balay -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1590f17f7ee2SSatish Balay 
1591f17f7ee2SSatish Balay    Output Parameter:
1592f17f7ee2SSatish Balay .     nA - matrix
1593f17f7ee2SSatish Balay 
1594f17f7ee2SSatish Balay    Options Database Keys:
159511a5261eSBarry Smith +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
159611a5261eSBarry Smith .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
159711a5261eSBarry Smith .     -mat_h2opus_order <`PetscInt`> - Chebychev approximation order
159811a5261eSBarry Smith -     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1599f17f7ee2SSatish Balay 
1600f17f7ee2SSatish Balay    Level: intermediate
1601f17f7ee2SSatish Balay 
1602*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1603f17f7ee2SSatish Balay @*/
1604d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1605d71ae5a4SJacob Faibussowitsch {
1606f17f7ee2SSatish Balay   Mat         A;
1607f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus;
1608f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1609f17f7ee2SSatish Balay   PetscBool iscpu = PETSC_FALSE;
1610f17f7ee2SSatish Balay   #else
1611f17f7ee2SSatish Balay   PetscBool iscpu = PETSC_TRUE;
1612f17f7ee2SSatish Balay   #endif
1613f17f7ee2SSatish Balay 
1614f17f7ee2SSatish Balay   PetscFunctionBegin;
1615036e4bdcSSatish Balay   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1616f17f7ee2SSatish Balay   PetscCall(MatCreate(comm, &A));
1617f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A, m, n, M, N));
1618036e4bdcSSatish Balay   PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1619f17f7ee2SSatish Balay   PetscCall(MatSetType(A, MATH2OPUS));
1620f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A, iscpu));
1621f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1622f17f7ee2SSatish Balay 
1623f17f7ee2SSatish Balay   h2opus = (Mat_H2OPUS *)A->data;
1624f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1625f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1626f17f7ee2SSatish Balay   if (basisord > 0) h2opus->basisord = basisord;
1627f17f7ee2SSatish Balay 
1628f17f7ee2SSatish Balay   *nA = A;
16293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1630f17f7ee2SSatish Balay }
1631f17f7ee2SSatish Balay 
1632f17f7ee2SSatish Balay /*@C
163311a5261eSBarry Smith      MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator.
1634f17f7ee2SSatish Balay 
1635f17f7ee2SSatish Balay    Input Parameters:
1636f17f7ee2SSatish Balay +     B - the matrix to be sampled
1637f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1638f17f7ee2SSatish Balay .     coords - coordinates of the points
1639f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
1640f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1641f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1642f17f7ee2SSatish Balay .     maxrank - maximum rank allowed
1643f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1644f17f7ee2SSatish Balay -     rtol - relative tolerance for construction
1645f17f7ee2SSatish Balay 
1646f17f7ee2SSatish Balay    Output Parameter:
1647f17f7ee2SSatish Balay .     nA - matrix
1648f17f7ee2SSatish Balay 
1649f17f7ee2SSatish Balay    Options Database Keys:
165011a5261eSBarry Smith +     -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
165111a5261eSBarry Smith .     -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
165211a5261eSBarry Smith .     -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
165311a5261eSBarry Smith .     -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
165411a5261eSBarry Smith .     -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
165511a5261eSBarry Smith .     -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd()
165611a5261eSBarry Smith .     -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction
1657aaa8cc7dSPierre Jolivet -     -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be when estimating norms
1658f17f7ee2SSatish Balay 
16592ef1f0ffSBarry Smith    Level: intermediate
16602ef1f0ffSBarry Smith 
166111a5261eSBarry Smith    Note:
166211a5261eSBarry Smith    Not available in parallel
1663f17f7ee2SSatish Balay 
1664*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1665f17f7ee2SSatish Balay @*/
1666d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1667d71ae5a4SJacob Faibussowitsch {
1668f17f7ee2SSatish Balay   Mat         A;
1669f17f7ee2SSatish Balay   Mat_H2OPUS *h2opus;
1670f17f7ee2SSatish Balay   MPI_Comm    comm;
1671f17f7ee2SSatish Balay   PetscBool   boundtocpu = PETSC_TRUE;
1672f17f7ee2SSatish Balay 
1673f17f7ee2SSatish Balay   PetscFunctionBegin;
1674f17f7ee2SSatish Balay   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
1675f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B, spacedim, 2);
1676f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(B, cdist, 4);
1677f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B, eta, 5);
1678f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B, leafsize, 6);
1679f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B, maxrank, 7);
1680f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B, bs, 8);
1681f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B, rtol, 9);
1682f17f7ee2SSatish Balay   PetscValidPointer(nA, 10);
1683f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1684036e4bdcSSatish Balay   PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1685036e4bdcSSatish Balay   PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1686f17f7ee2SSatish Balay   PetscCall(MatCreate(comm, &A));
1687f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1688f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1689f17f7ee2SSatish Balay   {
1690f17f7ee2SSatish Balay     PetscBool iscuda;
1691f17f7ee2SSatish Balay     VecType   vtype;
1692f17f7ee2SSatish Balay 
1693f17f7ee2SSatish Balay     PetscCall(MatGetVecType(B, &vtype));
1694f17f7ee2SSatish Balay     PetscCall(PetscStrcmp(vtype, VECCUDA, &iscuda));
1695f17f7ee2SSatish Balay     if (!iscuda) {
1696f17f7ee2SSatish Balay       PetscCall(PetscStrcmp(vtype, VECSEQCUDA, &iscuda));
169748a46eb9SPierre Jolivet       if (!iscuda) PetscCall(PetscStrcmp(vtype, VECMPICUDA, &iscuda));
1698f17f7ee2SSatish Balay     }
1699f17f7ee2SSatish Balay     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1700f17f7ee2SSatish Balay   }
1701f17f7ee2SSatish Balay   #endif
1702f17f7ee2SSatish Balay   PetscCall(MatSetType(A, MATH2OPUS));
1703f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A, boundtocpu));
170448a46eb9SPierre Jolivet   if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1705f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B, A));
1706f17f7ee2SSatish Balay   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1707f17f7ee2SSatish Balay 
1708f17f7ee2SSatish Balay   h2opus          = (Mat_H2OPUS *)A->data;
1709f17f7ee2SSatish Balay   h2opus->sampler = new PetscMatrixSampler(B);
1710f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1711f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1712f17f7ee2SSatish Balay   if (maxrank > 0) h2opus->max_rank = maxrank;
1713f17f7ee2SSatish Balay   if (bs > 0) h2opus->bs = bs;
1714f17f7ee2SSatish Balay   if (rtol > 0.) h2opus->rtol = rtol;
1715f17f7ee2SSatish Balay   *nA             = A;
1716f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
17173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1718f17f7ee2SSatish Balay }
1719f17f7ee2SSatish Balay 
1720f17f7ee2SSatish Balay /*@C
1721f17f7ee2SSatish Balay      MatH2OpusGetIndexMap - Access reordering index set.
1722f17f7ee2SSatish Balay 
17232fe279fdSBarry Smith    Input Parameter:
1724f17f7ee2SSatish Balay .     A - the matrix
1725f17f7ee2SSatish Balay 
1726f17f7ee2SSatish Balay    Output Parameter:
1727f17f7ee2SSatish Balay .     indexmap - the index set for the reordering
1728f17f7ee2SSatish Balay 
1729f17f7ee2SSatish Balay    Level: intermediate
1730f17f7ee2SSatish Balay 
1731*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1732f17f7ee2SSatish Balay @*/
1733d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1734d71ae5a4SJacob Faibussowitsch {
1735f17f7ee2SSatish Balay   PetscBool   ish2opus;
1736f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1737f17f7ee2SSatish Balay 
1738f17f7ee2SSatish Balay   PetscFunctionBegin;
1739f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1740f17f7ee2SSatish Balay   PetscValidType(A, 1);
1741f17f7ee2SSatish Balay   PetscValidPointer(indexmap, 2);
1742f17f7ee2SSatish Balay   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1743f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1744f17f7ee2SSatish Balay   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1745f17f7ee2SSatish Balay   *indexmap = a->h2opus_indexmap;
17463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1747f17f7ee2SSatish Balay }
1748f17f7ee2SSatish Balay 
1749f17f7ee2SSatish Balay /*@C
1750f17f7ee2SSatish Balay      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1751f17f7ee2SSatish Balay 
1752f17f7ee2SSatish Balay    Input Parameters:
1753f17f7ee2SSatish Balay +     A - the matrix
1754f17f7ee2SSatish Balay .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1755f17f7ee2SSatish Balay -     in - the vector to be mapped
1756f17f7ee2SSatish Balay 
1757f17f7ee2SSatish Balay    Output Parameter:
1758f17f7ee2SSatish Balay .     out - the newly created mapped vector
1759f17f7ee2SSatish Balay 
1760f17f7ee2SSatish Balay    Level: intermediate
1761f17f7ee2SSatish Balay 
1762*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1763f17f7ee2SSatish Balay @*/
1764d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1765d71ae5a4SJacob Faibussowitsch {
1766f17f7ee2SSatish Balay   PetscBool    ish2opus;
1767f17f7ee2SSatish Balay   Mat_H2OPUS  *a = (Mat_H2OPUS *)A->data;
1768f17f7ee2SSatish Balay   PetscScalar *xin, *xout;
1769f17f7ee2SSatish Balay   PetscBool    nm;
1770f17f7ee2SSatish Balay 
1771f17f7ee2SSatish Balay   PetscFunctionBegin;
1772f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1773f17f7ee2SSatish Balay   PetscValidType(A, 1);
1774f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A, nativetopetsc, 2);
1775f17f7ee2SSatish Balay   PetscValidHeaderSpecific(in, VEC_CLASSID, 3);
1776f17f7ee2SSatish Balay   PetscValidPointer(out, 4);
1777f17f7ee2SSatish Balay   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1778f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1779f17f7ee2SSatish Balay   PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1780f17f7ee2SSatish Balay   nm = a->nativemult;
1781f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1782f17f7ee2SSatish Balay   PetscCall(MatCreateVecs(A, out, NULL));
1783f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A, nm));
1784f17f7ee2SSatish Balay   if (!a->sf) { /* same ordering */
1785f17f7ee2SSatish Balay     PetscCall(VecCopy(in, *out));
17863ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1787f17f7ee2SSatish Balay   }
1788f17f7ee2SSatish Balay   PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1789f17f7ee2SSatish Balay   PetscCall(VecGetArrayWrite(*out, &xout));
1790f17f7ee2SSatish Balay   if (nativetopetsc) {
1791f17f7ee2SSatish Balay     PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792f17f7ee2SSatish Balay     PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1793f17f7ee2SSatish Balay   } else {
1794f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1795f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1796f17f7ee2SSatish Balay   }
1797f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1798f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayWrite(*out, &xout));
17993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1800f17f7ee2SSatish Balay }
1801f17f7ee2SSatish Balay 
1802f17f7ee2SSatish Balay /*@C
1803f17f7ee2SSatish Balay      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1804f17f7ee2SSatish Balay 
1805f17f7ee2SSatish Balay    Input Parameters:
180611a5261eSBarry Smith +     A - the hierarchical `MATH2OPUS` matrix
1807f17f7ee2SSatish Balay .     s - the scaling factor
1808f17f7ee2SSatish Balay .     U - the dense low-rank update matrix
18092ef1f0ffSBarry Smith -     V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1810f17f7ee2SSatish Balay 
181111a5261eSBarry Smith    Note:
18122ef1f0ffSBarry Smith    The `U` and `V` matrices must be in dense format
1813f17f7ee2SSatish Balay 
1814f17f7ee2SSatish Balay    Level: intermediate
1815f17f7ee2SSatish Balay 
1816*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1817f17f7ee2SSatish Balay @*/
1818d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1819d71ae5a4SJacob Faibussowitsch {
1820f17f7ee2SSatish Balay   PetscBool flg;
1821f17f7ee2SSatish Balay 
1822f17f7ee2SSatish Balay   PetscFunctionBegin;
1823f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1824f17f7ee2SSatish Balay   PetscValidType(A, 1);
1825f17f7ee2SSatish Balay   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1826f17f7ee2SSatish Balay   PetscValidHeaderSpecific(U, MAT_CLASSID, 2);
1827f17f7ee2SSatish Balay   PetscCheckSameComm(A, 1, U, 2);
1828f17f7ee2SSatish Balay   if (V) {
1829f17f7ee2SSatish Balay     PetscValidHeaderSpecific(V, MAT_CLASSID, 3);
1830f17f7ee2SSatish Balay     PetscCheckSameComm(A, 1, V, 3);
1831f17f7ee2SSatish Balay   }
1832f17f7ee2SSatish Balay   PetscValidLogicalCollectiveScalar(A, s, 4);
1833f17f7ee2SSatish Balay 
1834f17f7ee2SSatish Balay   if (!V) V = U;
1835036e4bdcSSatish Balay   PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
18363ba16761SJacob Faibussowitsch   if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1837f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1838f17f7ee2SSatish Balay   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1839f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1840f17f7ee2SSatish Balay   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout");
1841f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1842f17f7ee2SSatish Balay   if (flg) {
1843f17f7ee2SSatish Balay     Mat_H2OPUS        *a = (Mat_H2OPUS *)A->data;
1844f17f7ee2SSatish Balay     const PetscScalar *u, *v, *uu, *vv;
1845f17f7ee2SSatish Balay     PetscInt           ldu, ldv;
1846f17f7ee2SSatish Balay     PetscMPIInt        size;
1847f17f7ee2SSatish Balay   #if defined(H2OPUS_USE_MPI)
1848f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1849f17f7ee2SSatish Balay   #else
1850f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1851f17f7ee2SSatish Balay   #endif
1852f17f7ee2SSatish Balay     PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1853f17f7ee2SSatish Balay     PetscSF   usf, vsf;
1854f17f7ee2SSatish Balay 
1855f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1856036e4bdcSSatish Balay     PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1857f17f7ee2SSatish Balay     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1858f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1859f17f7ee2SSatish Balay     PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1860f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1861f17f7ee2SSatish Balay     PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1862f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(U, &ldu));
1863f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(V, &ldv));
1864f17f7ee2SSatish Balay     PetscCall(MatBoundToCPU(A, &flg));
1865f17f7ee2SSatish Balay     if (usesf) {
1866f17f7ee2SSatish Balay       PetscInt n;
1867f17f7ee2SSatish Balay 
1868f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf));
1869f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf));
1870f17f7ee2SSatish Balay       PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1871f17f7ee2SSatish Balay       PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1872f17f7ee2SSatish Balay       ldu = n;
1873f17f7ee2SSatish Balay       ldv = n;
1874f17f7ee2SSatish Balay     }
1875f17f7ee2SSatish Balay     if (flg) {
1876f17f7ee2SSatish Balay       PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1877f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(U, &u));
1878f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(V, &v));
1879f17f7ee2SSatish Balay       if (usesf) {
1880f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy);
1881f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1882f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1883f17f7ee2SSatish Balay         if (U != V) {
1884f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx);
1885f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1886f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1887f17f7ee2SSatish Balay         } else uu = vv;
18889371c9d4SSatish Balay       } else {
18899371c9d4SSatish Balay         uu = u;
18909371c9d4SSatish Balay         vv = v;
18919371c9d4SSatish Balay       }
1892f17f7ee2SSatish Balay       hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1893f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(U, &u));
1894f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(V, &v));
1895f17f7ee2SSatish Balay     } else {
1896f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1897f17f7ee2SSatish Balay       PetscBool flgU, flgV;
1898f17f7ee2SSatish Balay 
1899f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1900f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1901f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1902f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1903f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1904f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1905f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1906f17f7ee2SSatish Balay       if (usesf) {
1907f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1908f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1909f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1910f17f7ee2SSatish Balay         if (U != V) {
1911f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1912f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1913f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1914f17f7ee2SSatish Balay         } else uu = vv;
19159371c9d4SSatish Balay       } else {
19169371c9d4SSatish Balay         uu = u;
19179371c9d4SSatish Balay         vv = v;
19189371c9d4SSatish Balay       }
1919f17f7ee2SSatish Balay   #else
1920f17f7ee2SSatish Balay       SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1921f17f7ee2SSatish Balay   #endif
1922f17f7ee2SSatish Balay       hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1923f17f7ee2SSatish Balay   #if defined(PETSC_H2OPUS_USE_GPU)
1924f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1925f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1926f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1927f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1928f17f7ee2SSatish Balay   #endif
1929f17f7ee2SSatish Balay     }
1930f17f7ee2SSatish Balay     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1931f17f7ee2SSatish Balay     a->orthogonal = PETSC_FALSE;
1932f17f7ee2SSatish Balay   }
19333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1934f17f7ee2SSatish Balay }
1935f17f7ee2SSatish Balay #endif
1936