xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 036e4bdc5ff4e9428be71571d75dc99dc1cdff13)
1f17f7ee2SSatish Balay #include <h2opusconf.h>
2f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4f17f7ee2SSatish Balay #include <h2opus.h>
5f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
6f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h>
7f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h>
8f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h>
9f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h>
10f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h>
11f17f7ee2SSatish Balay #endif
12f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h>
13f17f7ee2SSatish Balay #include <petsc/private/matimpl.h>
14f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h>
15f17f7ee2SSatish Balay #include <petsc/private/deviceimpl.h>
16f17f7ee2SSatish Balay #include <petscsf.h>
17f17f7ee2SSatish Balay 
18f17f7ee2SSatish Balay /* math2opusutils */
19f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24f17f7ee2SSatish Balay 
25f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26f17f7ee2SSatish Balay 
27f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */
28f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU
30f17f7ee2SSatish Balay #endif
31f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
32f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33f17f7ee2SSatish Balay #else
34f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) 0
35f17f7ee2SSatish Balay #endif
36f17f7ee2SSatish Balay 
37f17f7ee2SSatish Balay // TODO H2OPUS:
38f17f7ee2SSatish Balay // DistributedHMatrix
39f17f7ee2SSatish Balay //   unsymmetric ?
40f17f7ee2SSatish Balay //   transpose for distributed_hgemv?
41f17f7ee2SSatish Balay //   clearData()
42f17f7ee2SSatish Balay // Unify interface for sequential and parallel?
43f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44f17f7ee2SSatish Balay //
45f17f7ee2SSatish Balay template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46f17f7ee2SSatish Balay {
47f17f7ee2SSatish Balay   private:
48f17f7ee2SSatish Balay     int dimension;
49f17f7ee2SSatish Balay     size_t num_points;
50f17f7ee2SSatish Balay     std::vector<T> pts;
51f17f7ee2SSatish Balay 
52f17f7ee2SSatish Balay   public:
53f17f7ee2SSatish Balay     PetscPointCloud(int dim, size_t num_pts, const T coords[])
54f17f7ee2SSatish Balay     {
55f17f7ee2SSatish Balay       dim = dim > 0 ? dim : 1;
56f17f7ee2SSatish Balay       this->dimension = dim;
57f17f7ee2SSatish Balay       this->num_points = num_pts;
58f17f7ee2SSatish Balay 
59f17f7ee2SSatish Balay       pts.resize(num_pts*dim);
60f17f7ee2SSatish Balay       if (coords) {
61*036e4bdcSSatish Balay         for (size_t n = 0; n < num_pts; n++)
62f17f7ee2SSatish Balay           for (int i = 0; i < dim; i++)
63f17f7ee2SSatish Balay             pts[n*dim + i] = coords[n*dim + i];
64f17f7ee2SSatish Balay       } else {
65*036e4bdcSSatish Balay         PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
66*036e4bdcSSatish Balay         for (size_t n = 0; n < num_pts; n++) {
67*036e4bdcSSatish Balay           pts[n*dim] = n*h;
68*036e4bdcSSatish Balay           for (int i = 1; i < dim; i++)
69*036e4bdcSSatish Balay             pts[n*dim + i] = 0.0;
70*036e4bdcSSatish Balay         }
71f17f7ee2SSatish Balay       }
72f17f7ee2SSatish Balay     }
73f17f7ee2SSatish Balay 
74f17f7ee2SSatish Balay     PetscPointCloud(const PetscPointCloud<T>& other)
75f17f7ee2SSatish Balay     {
76f17f7ee2SSatish Balay       size_t N = other.dimension * other.num_points;
77f17f7ee2SSatish Balay       this->dimension = other.dimension;
78f17f7ee2SSatish Balay       this->num_points = other.num_points;
79f17f7ee2SSatish Balay       this->pts.resize(N);
80f17f7ee2SSatish Balay       for (size_t i = 0; i < N; i++)
81f17f7ee2SSatish Balay         this->pts[i] = other.pts[i];
82f17f7ee2SSatish Balay     }
83f17f7ee2SSatish Balay 
84f17f7ee2SSatish Balay     int getDimension() const
85f17f7ee2SSatish Balay     {
86f17f7ee2SSatish Balay         return dimension;
87f17f7ee2SSatish Balay     }
88f17f7ee2SSatish Balay 
89f17f7ee2SSatish Balay     size_t getDataSetSize() const
90f17f7ee2SSatish Balay     {
91f17f7ee2SSatish Balay         return num_points;
92f17f7ee2SSatish Balay     }
93f17f7ee2SSatish Balay 
94f17f7ee2SSatish Balay     T getDataPoint(size_t idx, int dim) const
95f17f7ee2SSatish Balay     {
96f17f7ee2SSatish Balay         assert(dim < dimension && idx < num_points);
97f17f7ee2SSatish Balay         return pts[idx*dimension + dim];
98f17f7ee2SSatish Balay     }
99f17f7ee2SSatish Balay 
100f17f7ee2SSatish Balay     void Print(std::ostream& out = std::cout)
101f17f7ee2SSatish Balay     {
102f17f7ee2SSatish Balay        out << "Dimension: " << dimension << std::endl;
103f17f7ee2SSatish Balay        out << "NumPoints: " << num_points << std::endl;
104f17f7ee2SSatish Balay        for (size_t n = 0; n < num_points; n++) {
105f17f7ee2SSatish Balay          for (int d = 0; d < dimension; d++)
106f17f7ee2SSatish Balay            out << pts[n*dimension + d] << " ";
107f17f7ee2SSatish Balay          out << std::endl;
108f17f7ee2SSatish Balay        }
109f17f7ee2SSatish Balay     }
110f17f7ee2SSatish Balay };
111f17f7ee2SSatish Balay 
112f17f7ee2SSatish Balay template<class T> class PetscFunctionGenerator
113f17f7ee2SSatish Balay {
114f17f7ee2SSatish Balay private:
115f17f7ee2SSatish Balay   MatH2OpusKernel k;
116f17f7ee2SSatish Balay   int             dim;
117f17f7ee2SSatish Balay   void            *ctx;
118f17f7ee2SSatish Balay 
119f17f7ee2SSatish Balay public:
120f17f7ee2SSatish Balay     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
121f17f7ee2SSatish Balay     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
122f17f7ee2SSatish Balay     T operator()(PetscReal *pt1, PetscReal *pt2)
123f17f7ee2SSatish Balay     {
124f17f7ee2SSatish Balay         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
125f17f7ee2SSatish Balay     }
126f17f7ee2SSatish Balay };
127f17f7ee2SSatish Balay 
128f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp>
129f17f7ee2SSatish Balay 
130f17f7ee2SSatish Balay /* just to not clutter the code */
131f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU)
132f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU;
133f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
134f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU;
135f17f7ee2SSatish Balay #endif
136f17f7ee2SSatish Balay #endif
137f17f7ee2SSatish Balay 
138f17f7ee2SSatish Balay typedef struct {
139f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
140f17f7ee2SSatish Balay   distributedH2OpusHandle_t handle;
141f17f7ee2SSatish Balay #else
142f17f7ee2SSatish Balay   h2opusHandle_t handle;
143f17f7ee2SSatish Balay #endif
144f17f7ee2SSatish Balay   /* Sequential and parallel matrices are two different classes at the moment */
145f17f7ee2SSatish Balay   HMatrix *hmatrix;
146f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
147f17f7ee2SSatish Balay   DistributedHMatrix *dist_hmatrix;
148f17f7ee2SSatish Balay #else
149f17f7ee2SSatish Balay   HMatrix *dist_hmatrix; /* just to not clutter the code */
150f17f7ee2SSatish Balay #endif
151f17f7ee2SSatish Balay   /* May use permutations */
152f17f7ee2SSatish Balay   PetscSF sf;
153f17f7ee2SSatish Balay   PetscLayout h2opus_rmap, h2opus_cmap;
154f17f7ee2SSatish Balay   IS h2opus_indexmap;
155f17f7ee2SSatish Balay   thrust::host_vector<PetscScalar> *xx,*yy;
156f17f7ee2SSatish Balay   PetscInt xxs,yys;
157f17f7ee2SSatish Balay   PetscBool multsetup;
158f17f7ee2SSatish Balay 
159f17f7ee2SSatish Balay   /* GPU */
160f17f7ee2SSatish Balay   HMatrix_GPU *hmatrix_gpu;
161f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
162f17f7ee2SSatish Balay   DistributedHMatrix_GPU *dist_hmatrix_gpu;
163f17f7ee2SSatish Balay #else
164f17f7ee2SSatish Balay   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
165f17f7ee2SSatish Balay #endif
166f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
167f17f7ee2SSatish Balay   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
168f17f7ee2SSatish Balay   PetscInt xxs_gpu,yys_gpu;
169f17f7ee2SSatish Balay #endif
170f17f7ee2SSatish Balay 
171f17f7ee2SSatish Balay   /* construction from matvecs */
172f17f7ee2SSatish Balay   PetscMatrixSampler* sampler;
173f17f7ee2SSatish Balay   PetscBool nativemult;
174f17f7ee2SSatish Balay 
175f17f7ee2SSatish Balay   /* Admissibility */
176f17f7ee2SSatish Balay   PetscReal eta;
177f17f7ee2SSatish Balay   PetscInt  leafsize;
178f17f7ee2SSatish Balay 
179f17f7ee2SSatish Balay   /* for dof reordering */
180f17f7ee2SSatish Balay   PetscPointCloud<PetscReal> *ptcloud;
181f17f7ee2SSatish Balay 
182f17f7ee2SSatish Balay   /* kernel for generating matrix entries */
183f17f7ee2SSatish Balay   PetscFunctionGenerator<PetscScalar> *kernel;
184f17f7ee2SSatish Balay 
185f17f7ee2SSatish Balay   /* basis orthogonalized? */
186f17f7ee2SSatish Balay   PetscBool orthogonal;
187f17f7ee2SSatish Balay 
188f17f7ee2SSatish Balay   /* customization */
189f17f7ee2SSatish Balay   PetscInt  basisord;
190f17f7ee2SSatish Balay   PetscInt  max_rank;
191f17f7ee2SSatish Balay   PetscInt  bs;
192f17f7ee2SSatish Balay   PetscReal rtol;
193f17f7ee2SSatish Balay   PetscInt  norm_max_samples;
194f17f7ee2SSatish Balay   PetscBool check_construction;
195f17f7ee2SSatish Balay   PetscBool hara_verbose;
196*036e4bdcSSatish Balay   PetscBool resize;
197f17f7ee2SSatish Balay 
198f17f7ee2SSatish Balay   /* keeps track of MatScale values */
199f17f7ee2SSatish Balay   PetscScalar s;
200f17f7ee2SSatish Balay } Mat_H2OPUS;
201f17f7ee2SSatish Balay 
202f17f7ee2SSatish Balay static PetscErrorCode MatDestroy_H2OPUS(Mat A)
203f17f7ee2SSatish Balay {
204f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
205f17f7ee2SSatish Balay 
206f17f7ee2SSatish Balay   PetscFunctionBegin;
207f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
208f17f7ee2SSatish Balay   h2opusDestroyDistributedHandle(a->handle);
209f17f7ee2SSatish Balay #else
210f17f7ee2SSatish Balay   h2opusDestroyHandle(a->handle);
211f17f7ee2SSatish Balay #endif
212f17f7ee2SSatish Balay   delete a->dist_hmatrix;
213f17f7ee2SSatish Balay   delete a->hmatrix;
214f17f7ee2SSatish Balay   PetscCall(PetscSFDestroy(&a->sf));
215f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
216f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
217f17f7ee2SSatish Balay   PetscCall(ISDestroy(&a->h2opus_indexmap));
218f17f7ee2SSatish Balay   delete a->xx;
219f17f7ee2SSatish Balay   delete a->yy;
220f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
221f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
222f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
223f17f7ee2SSatish Balay   delete a->xx_gpu;
224f17f7ee2SSatish Balay   delete a->yy_gpu;
225f17f7ee2SSatish Balay #endif
226f17f7ee2SSatish Balay   delete a->sampler;
227f17f7ee2SSatish Balay   delete a->ptcloud;
228f17f7ee2SSatish Balay   delete a->kernel;
229f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
230f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
231f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
232f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
233f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
234f17f7ee2SSatish Balay   PetscCall(PetscFree(A->data));
235f17f7ee2SSatish Balay   PetscFunctionReturn(0);
236f17f7ee2SSatish Balay }
237f17f7ee2SSatish Balay 
238f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
239f17f7ee2SSatish Balay {
240f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
241f17f7ee2SSatish Balay   PetscBool      ish2opus;
242f17f7ee2SSatish Balay 
243f17f7ee2SSatish Balay   PetscFunctionBegin;
244f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
245f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A,nm,2);
246f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
247f17f7ee2SSatish Balay   if (ish2opus) {
248f17f7ee2SSatish Balay     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
249f17f7ee2SSatish Balay       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
250f17f7ee2SSatish Balay         PetscLayout t;
251f17f7ee2SSatish Balay         t = A->rmap;
252f17f7ee2SSatish Balay         A->rmap = a->h2opus_rmap;
253f17f7ee2SSatish Balay         a->h2opus_rmap = t;
254f17f7ee2SSatish Balay         t = A->cmap;
255f17f7ee2SSatish Balay         A->cmap = a->h2opus_cmap;
256f17f7ee2SSatish Balay         a->h2opus_cmap = t;
257f17f7ee2SSatish Balay       }
258f17f7ee2SSatish Balay     }
259f17f7ee2SSatish Balay     a->nativemult = nm;
260f17f7ee2SSatish Balay   }
261f17f7ee2SSatish Balay   PetscFunctionReturn(0);
262f17f7ee2SSatish Balay }
263f17f7ee2SSatish Balay 
264f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
265f17f7ee2SSatish Balay {
266f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
267f17f7ee2SSatish Balay   PetscBool      ish2opus;
268f17f7ee2SSatish Balay 
269f17f7ee2SSatish Balay   PetscFunctionBegin;
270f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
271f17f7ee2SSatish Balay   PetscValidPointer(nm,2);
272f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
273f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
274f17f7ee2SSatish Balay   *nm = a->nativemult;
275f17f7ee2SSatish Balay   PetscFunctionReturn(0);
276f17f7ee2SSatish Balay }
277f17f7ee2SSatish Balay 
278f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
279f17f7ee2SSatish Balay {
280f17f7ee2SSatish Balay   PetscBool      ish2opus;
281f17f7ee2SSatish Balay   PetscInt       nmax = PETSC_DECIDE;
282f17f7ee2SSatish Balay   Mat_H2OPUS     *a = NULL;
283f17f7ee2SSatish Balay   PetscBool      mult = PETSC_FALSE;
284f17f7ee2SSatish Balay 
285f17f7ee2SSatish Balay   PetscFunctionBegin;
286f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
287f17f7ee2SSatish Balay   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
288f17f7ee2SSatish Balay     a = (Mat_H2OPUS*)A->data;
289f17f7ee2SSatish Balay 
290f17f7ee2SSatish Balay     nmax = a->norm_max_samples;
291f17f7ee2SSatish Balay     mult = a->nativemult;
292f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
293f17f7ee2SSatish Balay   } else {
294f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
295f17f7ee2SSatish Balay   }
296f17f7ee2SSatish Balay   PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
297f17f7ee2SSatish Balay   if (a) PetscCall(MatH2OpusSetNativeMult(A,mult));
298f17f7ee2SSatish Balay   PetscFunctionReturn(0);
299f17f7ee2SSatish Balay }
300f17f7ee2SSatish Balay 
301f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
302f17f7ee2SSatish Balay {
303f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
304f17f7ee2SSatish Balay   PetscInt       n;
305f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
306f17f7ee2SSatish Balay 
307f17f7ee2SSatish Balay   PetscFunctionBegin;
308f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
309f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
310f17f7ee2SSatish Balay #endif
311f17f7ee2SSatish Balay   PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
312f17f7ee2SSatish Balay   if (boundtocpu) {
313f17f7ee2SSatish Balay     if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
314f17f7ee2SSatish Balay     if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
315f17f7ee2SSatish Balay   }
316f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
317f17f7ee2SSatish Balay   if (!boundtocpu) {
318f17f7ee2SSatish Balay     if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
319f17f7ee2SSatish Balay     if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
320f17f7ee2SSatish Balay   }
321f17f7ee2SSatish Balay #endif
322f17f7ee2SSatish Balay   PetscFunctionReturn(0);
323f17f7ee2SSatish Balay }
324f17f7ee2SSatish Balay 
325f17f7ee2SSatish Balay static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
326f17f7ee2SSatish Balay {
327f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
328f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
329f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
330f17f7ee2SSatish Balay #else
331f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
332f17f7ee2SSatish Balay #endif
333f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
334f17f7ee2SSatish Balay   PetscScalar    *xx,*yy,*uxx,*uyy;
335f17f7ee2SSatish Balay   PetscInt       blda,clda;
336f17f7ee2SSatish Balay   PetscMPIInt    size;
337f17f7ee2SSatish Balay   PetscSF        bsf,csf;
338f17f7ee2SSatish Balay   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
339f17f7ee2SSatish Balay 
340f17f7ee2SSatish Balay   PetscFunctionBegin;
341f17f7ee2SSatish Balay   HLibProfile::clear();
342f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
343f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
344f17f7ee2SSatish Balay #endif
345f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(B,&blda));
346f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(C,&clda));
347f17f7ee2SSatish Balay   if (usesf) {
348f17f7ee2SSatish Balay     PetscInt n;
349f17f7ee2SSatish Balay 
350f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
351f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));
352f17f7ee2SSatish Balay 
353f17f7ee2SSatish Balay     PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
354f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
355f17f7ee2SSatish Balay     blda = n;
356f17f7ee2SSatish Balay     clda = n;
357f17f7ee2SSatish Balay   }
358f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
359f17f7ee2SSatish Balay   if (boundtocpu) {
360f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
361f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayWrite(C,&yy));
362f17f7ee2SSatish Balay     if (usesf) {
363f17f7ee2SSatish Balay       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
364f17f7ee2SSatish Balay       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
365f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
366f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
367f17f7ee2SSatish Balay     } else {
368f17f7ee2SSatish Balay       uxx = xx;
369f17f7ee2SSatish Balay       uyy = yy;
370f17f7ee2SSatish Balay     }
371f17f7ee2SSatish Balay     if (size > 1) {
372f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
373*036e4bdcSSatish Balay       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
374f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
375f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
376f17f7ee2SSatish Balay #endif
377f17f7ee2SSatish Balay     } else {
378f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
379f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
380f17f7ee2SSatish Balay     }
381f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
382f17f7ee2SSatish Balay     if (usesf) {
383f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
384f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
385f17f7ee2SSatish Balay     }
386f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayWrite(C,&yy));
387f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
388f17f7ee2SSatish Balay   } else {
389f17f7ee2SSatish Balay     PetscBool ciscuda,biscuda;
390f17f7ee2SSatish Balay 
391f17f7ee2SSatish Balay     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
392f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
393f17f7ee2SSatish Balay     if (!biscuda) {
394f17f7ee2SSatish Balay       PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
395f17f7ee2SSatish Balay     }
396f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
397f17f7ee2SSatish Balay     if (!ciscuda) {
398f17f7ee2SSatish Balay       C->assembled = PETSC_TRUE;
399f17f7ee2SSatish Balay       PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
400f17f7ee2SSatish Balay     }
401f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
402f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
403f17f7ee2SSatish Balay     if (usesf) {
404f17f7ee2SSatish Balay       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
405f17f7ee2SSatish Balay       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
406f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
407f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
408f17f7ee2SSatish Balay     } else {
409f17f7ee2SSatish Balay       uxx = xx;
410f17f7ee2SSatish Balay       uyy = yy;
411f17f7ee2SSatish Balay     }
412f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
413f17f7ee2SSatish Balay     if (size > 1) {
414f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
415*036e4bdcSSatish Balay       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
416f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
417f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
418f17f7ee2SSatish Balay #endif
419f17f7ee2SSatish Balay     } else {
420f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
421f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
422f17f7ee2SSatish Balay     }
423f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
424f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
425f17f7ee2SSatish Balay     if (usesf) {
426f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
427f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
428f17f7ee2SSatish Balay     }
429f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
430f17f7ee2SSatish Balay     if (!biscuda) {
431f17f7ee2SSatish Balay       PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
432f17f7ee2SSatish Balay     }
433f17f7ee2SSatish Balay     if (!ciscuda) {
434f17f7ee2SSatish Balay       PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
435f17f7ee2SSatish Balay     }
436f17f7ee2SSatish Balay #endif
437f17f7ee2SSatish Balay   }
438f17f7ee2SSatish Balay   { /* log flops */
439f17f7ee2SSatish Balay     double gops,time,perf,dev;
440f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops,time,perf,dev);
441f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
442f17f7ee2SSatish Balay     if (boundtocpu) {
443f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
444f17f7ee2SSatish Balay     } else {
445f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
446f17f7ee2SSatish Balay     }
447f17f7ee2SSatish Balay #else
448f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
449f17f7ee2SSatish Balay #endif
450f17f7ee2SSatish Balay   }
451f17f7ee2SSatish Balay   PetscFunctionReturn(0);
452f17f7ee2SSatish Balay }
453f17f7ee2SSatish Balay 
454f17f7ee2SSatish Balay static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
455f17f7ee2SSatish Balay {
456f17f7ee2SSatish Balay   Mat_Product    *product = C->product;
457f17f7ee2SSatish Balay 
458f17f7ee2SSatish Balay   PetscFunctionBegin;
459f17f7ee2SSatish Balay   MatCheckProduct(C,1);
460f17f7ee2SSatish Balay   switch (product->type) {
461f17f7ee2SSatish Balay   case MATPRODUCT_AB:
462f17f7ee2SSatish Balay     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
463f17f7ee2SSatish Balay     break;
464f17f7ee2SSatish Balay   case MATPRODUCT_AtB:
465f17f7ee2SSatish Balay     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
466f17f7ee2SSatish Balay     break;
467f17f7ee2SSatish Balay   default:
468f17f7ee2SSatish Balay     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
469f17f7ee2SSatish Balay   }
470f17f7ee2SSatish Balay   PetscFunctionReturn(0);
471f17f7ee2SSatish Balay }
472f17f7ee2SSatish Balay 
473f17f7ee2SSatish Balay static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
474f17f7ee2SSatish Balay {
475f17f7ee2SSatish Balay   Mat_Product    *product = C->product;
476f17f7ee2SSatish Balay   PetscBool      cisdense;
477f17f7ee2SSatish Balay   Mat            A,B;
478f17f7ee2SSatish Balay 
479f17f7ee2SSatish Balay   PetscFunctionBegin;
480f17f7ee2SSatish Balay   MatCheckProduct(C,1);
481f17f7ee2SSatish Balay   A = product->A;
482f17f7ee2SSatish Balay   B = product->B;
483f17f7ee2SSatish Balay   switch (product->type) {
484f17f7ee2SSatish Balay   case MATPRODUCT_AB:
485f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
486f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
487f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
488f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
489f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
490f17f7ee2SSatish Balay     break;
491f17f7ee2SSatish Balay   case MATPRODUCT_AtB:
492f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
493f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
494f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
495f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
496f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
497f17f7ee2SSatish Balay     break;
498f17f7ee2SSatish Balay   default:
499f17f7ee2SSatish Balay     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
500f17f7ee2SSatish Balay   }
501f17f7ee2SSatish Balay   C->ops->productsymbolic = NULL;
502f17f7ee2SSatish Balay   C->ops->productnumeric = MatProductNumeric_H2OPUS;
503f17f7ee2SSatish Balay   PetscFunctionReturn(0);
504f17f7ee2SSatish Balay }
505f17f7ee2SSatish Balay 
506f17f7ee2SSatish Balay static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
507f17f7ee2SSatish Balay {
508f17f7ee2SSatish Balay   PetscFunctionBegin;
509f17f7ee2SSatish Balay   MatCheckProduct(C,1);
510f17f7ee2SSatish Balay   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
511f17f7ee2SSatish Balay     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
512f17f7ee2SSatish Balay   }
513f17f7ee2SSatish Balay   PetscFunctionReturn(0);
514f17f7ee2SSatish Balay }
515f17f7ee2SSatish Balay 
516f17f7ee2SSatish Balay static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517f17f7ee2SSatish Balay {
518f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
519f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
520f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
521f17f7ee2SSatish Balay #else
522f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
523f17f7ee2SSatish Balay #endif
524f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
525f17f7ee2SSatish Balay   PetscInt       n;
526f17f7ee2SSatish Balay   PetscScalar    *xx,*yy,*uxx,*uyy;
527f17f7ee2SSatish Balay   PetscMPIInt    size;
528f17f7ee2SSatish Balay   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
529f17f7ee2SSatish Balay 
530f17f7ee2SSatish Balay   PetscFunctionBegin;
531f17f7ee2SSatish Balay   HLibProfile::clear();
532f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
533f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
534f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
535f17f7ee2SSatish Balay #endif
536f17f7ee2SSatish Balay   if (usesf) {
537f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
538f17f7ee2SSatish Balay   } else n = A->rmap->n;
539f17f7ee2SSatish Balay   if (boundtocpu) {
540f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
541f17f7ee2SSatish Balay     if (sy == 0.0) {
542f17f7ee2SSatish Balay       PetscCall(VecGetArrayWrite(y,&yy));
543f17f7ee2SSatish Balay     } else {
544f17f7ee2SSatish Balay       PetscCall(VecGetArray(y,&yy));
545f17f7ee2SSatish Balay     }
546f17f7ee2SSatish Balay     if (usesf) {
547f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
549f17f7ee2SSatish Balay 
550f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
551f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
552f17f7ee2SSatish Balay       if (sy != 0.0) {
553f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
554f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
555f17f7ee2SSatish Balay       }
556f17f7ee2SSatish Balay     } else {
557f17f7ee2SSatish Balay       uxx = xx;
558f17f7ee2SSatish Balay       uyy = yy;
559f17f7ee2SSatish Balay     }
560f17f7ee2SSatish Balay     if (size > 1) {
561f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
562*036e4bdcSSatish Balay       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
563f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
564f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565f17f7ee2SSatish Balay #endif
566f17f7ee2SSatish Balay     } else {
567f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
568f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569f17f7ee2SSatish Balay     }
570f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
571f17f7ee2SSatish Balay     if (usesf) {
572f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
573f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
574f17f7ee2SSatish Balay     }
575f17f7ee2SSatish Balay     if (sy == 0.0) {
576f17f7ee2SSatish Balay       PetscCall(VecRestoreArrayWrite(y,&yy));
577f17f7ee2SSatish Balay     } else {
578f17f7ee2SSatish Balay       PetscCall(VecRestoreArray(y,&yy));
579f17f7ee2SSatish Balay     }
580f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
581f17f7ee2SSatish Balay   } else {
582f17f7ee2SSatish Balay     PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
583f17f7ee2SSatish Balay     if (sy == 0.0) {
584f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArrayWrite(y,&yy));
585f17f7ee2SSatish Balay     } else {
586f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArray(y,&yy));
587f17f7ee2SSatish Balay     }
588f17f7ee2SSatish Balay     if (usesf) {
589f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
591f17f7ee2SSatish Balay 
592f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
593f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
594f17f7ee2SSatish Balay       if (sy != 0.0) {
595f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
596f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
597f17f7ee2SSatish Balay       }
598f17f7ee2SSatish Balay     } else {
599f17f7ee2SSatish Balay       uxx = xx;
600f17f7ee2SSatish Balay       uyy = yy;
601f17f7ee2SSatish Balay     }
602f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
603f17f7ee2SSatish Balay     if (size > 1) {
604f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
605*036e4bdcSSatish Balay       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
606f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
607f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608f17f7ee2SSatish Balay #endif
609f17f7ee2SSatish Balay     } else {
610f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
611f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612f17f7ee2SSatish Balay     }
613f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
614f17f7ee2SSatish Balay     PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
615f17f7ee2SSatish Balay     if (usesf) {
616f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
617f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
618f17f7ee2SSatish Balay     }
619f17f7ee2SSatish Balay     if (sy == 0.0) {
620f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArrayWrite(y,&yy));
621f17f7ee2SSatish Balay     } else {
622f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArray(y,&yy));
623f17f7ee2SSatish Balay     }
624f17f7ee2SSatish Balay #endif
625f17f7ee2SSatish Balay   }
626f17f7ee2SSatish Balay   { /* log flops */
627f17f7ee2SSatish Balay     double gops,time,perf,dev;
628f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops,time,perf,dev);
629f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
630f17f7ee2SSatish Balay     if (boundtocpu) {
631f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
632f17f7ee2SSatish Balay     } else {
633f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
634f17f7ee2SSatish Balay     }
635f17f7ee2SSatish Balay #else
636f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
637f17f7ee2SSatish Balay #endif
638f17f7ee2SSatish Balay   }
639f17f7ee2SSatish Balay   PetscFunctionReturn(0);
640f17f7ee2SSatish Balay }
641f17f7ee2SSatish Balay 
642f17f7ee2SSatish Balay static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643f17f7ee2SSatish Balay {
644f17f7ee2SSatish Balay   PetscBool      xiscuda,yiscuda;
645f17f7ee2SSatish Balay 
646f17f7ee2SSatish Balay   PetscFunctionBegin;
647f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
648f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
649f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
650f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
651f17f7ee2SSatish Balay   PetscFunctionReturn(0);
652f17f7ee2SSatish Balay }
653f17f7ee2SSatish Balay 
654f17f7ee2SSatish Balay static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655f17f7ee2SSatish Balay {
656f17f7ee2SSatish Balay   PetscBool      xiscuda,yiscuda;
657f17f7ee2SSatish Balay 
658f17f7ee2SSatish Balay   PetscFunctionBegin;
659f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
660f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
661f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
662f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
663f17f7ee2SSatish Balay   PetscFunctionReturn(0);
664f17f7ee2SSatish Balay }
665f17f7ee2SSatish Balay 
666f17f7ee2SSatish Balay static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667f17f7ee2SSatish Balay {
668f17f7ee2SSatish Balay   PetscBool      xiscuda,ziscuda;
669f17f7ee2SSatish Balay 
670f17f7ee2SSatish Balay   PetscFunctionBegin;
671f17f7ee2SSatish Balay   PetscCall(VecCopy(y,z));
672f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
673f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
674f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
675f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
676f17f7ee2SSatish Balay   PetscFunctionReturn(0);
677f17f7ee2SSatish Balay }
678f17f7ee2SSatish Balay 
679f17f7ee2SSatish Balay static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680f17f7ee2SSatish Balay {
681f17f7ee2SSatish Balay   PetscBool      xiscuda,ziscuda;
682f17f7ee2SSatish Balay 
683f17f7ee2SSatish Balay   PetscFunctionBegin;
684f17f7ee2SSatish Balay   PetscCall(VecCopy(y,z));
685f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
686f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
687f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
688f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
689f17f7ee2SSatish Balay   PetscFunctionReturn(0);
690f17f7ee2SSatish Balay }
691f17f7ee2SSatish Balay 
692f17f7ee2SSatish Balay static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693f17f7ee2SSatish Balay {
694f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
695f17f7ee2SSatish Balay 
696f17f7ee2SSatish Balay   PetscFunctionBegin;
697f17f7ee2SSatish Balay   a->s *= s;
698f17f7ee2SSatish Balay   PetscFunctionReturn(0);
699f17f7ee2SSatish Balay }
700f17f7ee2SSatish Balay 
701f17f7ee2SSatish Balay static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
702f17f7ee2SSatish Balay {
703f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
704f17f7ee2SSatish Balay 
705f17f7ee2SSatish Balay   PetscFunctionBegin;
706*036e4bdcSSatish Balay   PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options");
707f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
708f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
709f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
710f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
711f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
712f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
713f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
714f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
715f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
716*036e4bdcSSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL));
717*036e4bdcSSatish Balay   PetscOptionsHeadEnd();
718f17f7ee2SSatish Balay   PetscFunctionReturn(0);
719f17f7ee2SSatish Balay }
720f17f7ee2SSatish Balay 
721f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
722f17f7ee2SSatish Balay 
723f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724f17f7ee2SSatish Balay {
725f17f7ee2SSatish Balay   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
726f17f7ee2SSatish Balay   Vec               c;
727f17f7ee2SSatish Balay   PetscInt          spacedim;
728f17f7ee2SSatish Balay   const PetscScalar *coords;
729f17f7ee2SSatish Balay 
730f17f7ee2SSatish Balay   PetscFunctionBegin;
731f17f7ee2SSatish Balay   if (a->ptcloud) PetscFunctionReturn(0);
732f17f7ee2SSatish Balay   PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
733f17f7ee2SSatish Balay   if (!c && a->sampler) {
734f17f7ee2SSatish Balay     Mat S = a->sampler->GetSamplingMat();
735f17f7ee2SSatish Balay 
736f17f7ee2SSatish Balay     PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
737f17f7ee2SSatish Balay   }
738f17f7ee2SSatish Balay   if (!c) {
739f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
740f17f7ee2SSatish Balay   } else {
741f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(c,&coords));
742f17f7ee2SSatish Balay     PetscCall(VecGetBlockSize(c,&spacedim));
743f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
744f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(c,&coords));
745f17f7ee2SSatish Balay   }
746f17f7ee2SSatish Balay   PetscFunctionReturn(0);
747f17f7ee2SSatish Balay }
748f17f7ee2SSatish Balay 
749f17f7ee2SSatish Balay static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
750f17f7ee2SSatish Balay {
751f17f7ee2SSatish Balay   MPI_Comm       comm;
752f17f7ee2SSatish Balay   PetscMPIInt    size;
753f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
754f17f7ee2SSatish Balay   PetscInt       n = 0,*idx = NULL;
755f17f7ee2SSatish Balay   int            *iidx = NULL;
756f17f7ee2SSatish Balay   PetscCopyMode  own;
757f17f7ee2SSatish Balay   PetscBool      rid;
758f17f7ee2SSatish Balay 
759f17f7ee2SSatish Balay   PetscFunctionBegin;
760f17f7ee2SSatish Balay   if (a->multsetup) PetscFunctionReturn(0);
761f17f7ee2SSatish Balay   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
762f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
763f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
764f17f7ee2SSatish Balay     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
765f17f7ee2SSatish Balay     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
766f17f7ee2SSatish Balay     a->xxs_gpu = 1;
767f17f7ee2SSatish Balay     a->yys_gpu = 1;
768f17f7ee2SSatish Balay #endif
769f17f7ee2SSatish Balay     a->xx  = new thrust::host_vector<PetscScalar>(n);
770f17f7ee2SSatish Balay     a->yy  = new thrust::host_vector<PetscScalar>(n);
771f17f7ee2SSatish Balay     a->xxs = 1;
772f17f7ee2SSatish Balay     a->yys = 1;
773f17f7ee2SSatish Balay   } else {
774f17f7ee2SSatish Balay     IS is;
775f17f7ee2SSatish Balay     PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
776f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(comm,&size));
777f17f7ee2SSatish Balay     if (!a->h2opus_indexmap) {
778f17f7ee2SSatish Balay       if (size > 1) {
779f17f7ee2SSatish Balay         PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
780f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
781f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
782f17f7ee2SSatish Balay         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
783f17f7ee2SSatish Balay #endif
784f17f7ee2SSatish Balay       } else {
785f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
786f17f7ee2SSatish Balay         n    = a->hmatrix->u_basis_tree.index_map.size();
787f17f7ee2SSatish Balay       }
788f17f7ee2SSatish Balay 
789f17f7ee2SSatish Balay       if (PetscDefined(USE_64BIT_INDICES)) {
790f17f7ee2SSatish Balay         PetscInt i;
791f17f7ee2SSatish Balay 
792f17f7ee2SSatish Balay         own  = PETSC_OWN_POINTER;
793f17f7ee2SSatish Balay         PetscCall(PetscMalloc1(n,&idx));
794f17f7ee2SSatish Balay         for (i=0;i<n;i++) idx[i] = iidx[i];
795f17f7ee2SSatish Balay       } else {
796f17f7ee2SSatish Balay         own  = PETSC_COPY_VALUES;
797f17f7ee2SSatish Balay         idx  = (PetscInt*)iidx;
798f17f7ee2SSatish Balay       }
799f17f7ee2SSatish Balay       PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
800f17f7ee2SSatish Balay       PetscCall(ISSetPermutation(is));
801f17f7ee2SSatish Balay       PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
802f17f7ee2SSatish Balay       a->h2opus_indexmap = is;
803f17f7ee2SSatish Balay     }
804f17f7ee2SSatish Balay     PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
805f17f7ee2SSatish Balay     PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
806f17f7ee2SSatish Balay     rid  = (PetscBool)(n == A->rmap->n);
807f17f7ee2SSatish Balay     PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
808f17f7ee2SSatish Balay     if (rid) {
809f17f7ee2SSatish Balay       PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
810f17f7ee2SSatish Balay     }
811f17f7ee2SSatish Balay     if (!rid) {
812f17f7ee2SSatish Balay       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
813f17f7ee2SSatish Balay         PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
814f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
815f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
816f17f7ee2SSatish Balay         PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
817f17f7ee2SSatish Balay       }
818f17f7ee2SSatish Balay       PetscCall(PetscSFCreate(comm,&a->sf));
819f17f7ee2SSatish Balay       PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
820*036e4bdcSSatish Balay       PetscCall(PetscSFSetUp(a->sf));
821f17f7ee2SSatish Balay       PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
822f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
823f17f7ee2SSatish Balay       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
824f17f7ee2SSatish Balay       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
825f17f7ee2SSatish Balay       a->xxs_gpu = 1;
826f17f7ee2SSatish Balay       a->yys_gpu = 1;
827f17f7ee2SSatish Balay #endif
828f17f7ee2SSatish Balay       a->xx  = new thrust::host_vector<PetscScalar>(n);
829f17f7ee2SSatish Balay       a->yy  = new thrust::host_vector<PetscScalar>(n);
830f17f7ee2SSatish Balay       a->xxs = 1;
831f17f7ee2SSatish Balay       a->yys = 1;
832f17f7ee2SSatish Balay     }
833f17f7ee2SSatish Balay     PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
834f17f7ee2SSatish Balay   }
835f17f7ee2SSatish Balay   a->multsetup = PETSC_TRUE;
836f17f7ee2SSatish Balay   PetscFunctionReturn(0);
837f17f7ee2SSatish Balay }
838f17f7ee2SSatish Balay 
839f17f7ee2SSatish Balay static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
840f17f7ee2SSatish Balay {
841f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
842f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
843f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle->handle;
844f17f7ee2SSatish Balay #else
845f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle;
846f17f7ee2SSatish Balay #endif
847f17f7ee2SSatish Balay   PetscBool      kernel = PETSC_FALSE;
848f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
849f17f7ee2SSatish Balay   PetscBool      samplingdone = PETSC_FALSE;
850f17f7ee2SSatish Balay   MPI_Comm       comm;
851f17f7ee2SSatish Balay   PetscMPIInt    size;
852f17f7ee2SSatish Balay 
853f17f7ee2SSatish Balay   PetscFunctionBegin;
854f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
855*036e4bdcSSatish Balay   PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
856*036e4bdcSSatish Balay   PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
857f17f7ee2SSatish Balay 
858f17f7ee2SSatish Balay   /* XXX */
859f17f7ee2SSatish Balay   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
860f17f7ee2SSatish Balay 
861f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm,&size));
862f17f7ee2SSatish Balay   /* TODO REUSABILITY of geometric construction */
863f17f7ee2SSatish Balay   delete a->hmatrix;
864f17f7ee2SSatish Balay   delete a->dist_hmatrix;
865f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
866f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
867f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
868f17f7ee2SSatish Balay #endif
869f17f7ee2SSatish Balay   a->orthogonal = PETSC_FALSE;
870f17f7ee2SSatish Balay 
871f17f7ee2SSatish Balay   /* TODO: other? */
872f17f7ee2SSatish Balay   H2OpusBoxCenterAdmissibility adm(a->eta);
873f17f7ee2SSatish Balay 
874f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
875f17f7ee2SSatish Balay   if (size > 1) {
876f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
877f17f7ee2SSatish Balay     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
878f17f7ee2SSatish Balay #else
879f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
880f17f7ee2SSatish Balay #endif
881f17f7ee2SSatish Balay   } else {
882f17f7ee2SSatish Balay     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
883f17f7ee2SSatish Balay   }
884f17f7ee2SSatish Balay   PetscCall(MatH2OpusInferCoordinates_Private(A));
885f17f7ee2SSatish Balay   PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
886f17f7ee2SSatish Balay   if (a->kernel) {
887f17f7ee2SSatish Balay     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
888f17f7ee2SSatish Balay     if (size > 1) {
889f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
890f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
891f17f7ee2SSatish Balay       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
892f17f7ee2SSatish Balay #endif
893f17f7ee2SSatish Balay     } else {
894f17f7ee2SSatish Balay       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
895f17f7ee2SSatish Balay     }
896f17f7ee2SSatish Balay     kernel = PETSC_TRUE;
897f17f7ee2SSatish Balay   } else {
898*036e4bdcSSatish Balay     PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
899f17f7ee2SSatish Balay     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
900f17f7ee2SSatish Balay   }
901f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
902f17f7ee2SSatish Balay 
903f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
904f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
905f17f7ee2SSatish Balay   if (!boundtocpu) {
906f17f7ee2SSatish Balay     if (size > 1) {
907f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
908f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
909f17f7ee2SSatish Balay       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
910f17f7ee2SSatish Balay #endif
911f17f7ee2SSatish Balay     } else {
912f17f7ee2SSatish Balay       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
913f17f7ee2SSatish Balay     }
914f17f7ee2SSatish Balay   }
915f17f7ee2SSatish Balay #endif
916f17f7ee2SSatish Balay   if (size == 1) {
917f17f7ee2SSatish Balay     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
918f17f7ee2SSatish Balay       PetscReal Anorm;
919f17f7ee2SSatish Balay       bool      verbose;
920f17f7ee2SSatish Balay 
921f17f7ee2SSatish Balay       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
922f17f7ee2SSatish Balay       verbose = a->hara_verbose;
923f17f7ee2SSatish Balay       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
924f17f7ee2SSatish Balay       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
925f17f7ee2SSatish Balay       if (a->sf && !a->nativemult) {
926f17f7ee2SSatish Balay         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
927f17f7ee2SSatish Balay       }
928f17f7ee2SSatish Balay       a->sampler->SetStream(handle->getMainStream());
929f17f7ee2SSatish Balay       if (boundtocpu) {
930f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(false);
931f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
932f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
933f17f7ee2SSatish Balay       } else {
934f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(true);
935f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
936f17f7ee2SSatish Balay #endif
937f17f7ee2SSatish Balay       }
938f17f7ee2SSatish Balay       samplingdone = PETSC_TRUE;
939f17f7ee2SSatish Balay     }
940f17f7ee2SSatish Balay   }
941f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
942f17f7ee2SSatish Balay   if (!boundtocpu) {
943f17f7ee2SSatish Balay     delete a->hmatrix;
944f17f7ee2SSatish Balay     delete a->dist_hmatrix;
945f17f7ee2SSatish Balay     a->hmatrix = NULL;
946f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
947f17f7ee2SSatish Balay   }
948f17f7ee2SSatish Balay   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
949f17f7ee2SSatish Balay #endif
950f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));
951f17f7ee2SSatish Balay 
952f17f7ee2SSatish Balay   if (!a->s) a->s = 1.0;
953f17f7ee2SSatish Balay   A->assembled = PETSC_TRUE;
954f17f7ee2SSatish Balay 
955f17f7ee2SSatish Balay   if (samplingdone) {
956f17f7ee2SSatish Balay     PetscBool check = a->check_construction;
957f17f7ee2SSatish Balay     PetscBool checke = PETSC_FALSE;
958f17f7ee2SSatish Balay 
959f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
960f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
961f17f7ee2SSatish Balay     if (check) {
962f17f7ee2SSatish Balay       Mat       E,Ae;
963f17f7ee2SSatish Balay       PetscReal n1,ni,n2;
964f17f7ee2SSatish Balay       PetscReal n1A,niA,n2A;
965f17f7ee2SSatish Balay       void      (*normfunc)(void);
966f17f7ee2SSatish Balay 
967f17f7ee2SSatish Balay       Ae   = a->sampler->GetSamplingMat();
968f17f7ee2SSatish Balay       PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
969f17f7ee2SSatish Balay       PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
970f17f7ee2SSatish Balay       PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
971f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_1,&n1));
972f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_INFINITY,&ni));
973f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_2,&n2));
974f17f7ee2SSatish Balay       if (checke) {
975f17f7ee2SSatish Balay         Mat eA,eE,eAe;
976f17f7ee2SSatish Balay 
977f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(A,MATAIJ,&eA));
978f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(E,MATAIJ,&eE));
979f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
980f17f7ee2SSatish Balay         PetscCall(MatChop(eA,PETSC_SMALL));
981f17f7ee2SSatish Balay         PetscCall(MatChop(eE,PETSC_SMALL));
982f17f7ee2SSatish Balay         PetscCall(MatChop(eAe,PETSC_SMALL));
983f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
984f17f7ee2SSatish Balay         PetscCall(MatView(eA,NULL));
985f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
986f17f7ee2SSatish Balay         PetscCall(MatView(eAe,NULL));
987f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
988f17f7ee2SSatish Balay         PetscCall(MatView(eE,NULL));
989f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eA));
990f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eE));
991f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eAe));
992f17f7ee2SSatish Balay       }
993f17f7ee2SSatish Balay 
994f17f7ee2SSatish Balay       PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
995f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
996f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_1,&n1A));
997f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
998f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_2,&n2A));
999f17f7ee2SSatish Balay       n1A  = PetscMax(n1A,PETSC_SMALL);
1000f17f7ee2SSatish Balay       n2A  = PetscMax(n2A,PETSC_SMALL);
1001f17f7ee2SSatish Balay       niA  = PetscMax(niA,PETSC_SMALL);
1002f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
1003f17f7ee2SSatish Balay       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
1004f17f7ee2SSatish Balay       PetscCall(MatDestroy(&E));
1005f17f7ee2SSatish Balay     }
1006f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(NULL);
1007f17f7ee2SSatish Balay   }
1008f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1009f17f7ee2SSatish Balay }
1010f17f7ee2SSatish Balay 
1011f17f7ee2SSatish Balay static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1012f17f7ee2SSatish Balay {
1013f17f7ee2SSatish Balay   PetscMPIInt    size;
1014f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1015f17f7ee2SSatish Balay 
1016f17f7ee2SSatish Balay   PetscFunctionBegin;
1017f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1018*036e4bdcSSatish Balay   PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1019f17f7ee2SSatish Balay   else {
1020f17f7ee2SSatish Balay     a->hmatrix->clearData();
1021f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1022f17f7ee2SSatish Balay     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1023f17f7ee2SSatish Balay #endif
1024f17f7ee2SSatish Balay   }
1025f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1026f17f7ee2SSatish Balay }
1027f17f7ee2SSatish Balay 
1028f17f7ee2SSatish Balay static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1029f17f7ee2SSatish Balay {
1030f17f7ee2SSatish Balay   Mat            A;
1031f17f7ee2SSatish Balay   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1032f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1033f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_FALSE;
1034f17f7ee2SSatish Balay #else
1035f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_TRUE;
1036f17f7ee2SSatish Balay #endif
1037f17f7ee2SSatish Balay   MPI_Comm       comm;
1038f17f7ee2SSatish Balay 
1039f17f7ee2SSatish Balay   PetscFunctionBegin;
1040f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1041f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1042f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1043f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1044f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B,A));
1045f17f7ee2SSatish Balay   a = (Mat_H2OPUS*)A->data;
1046f17f7ee2SSatish Balay 
1047f17f7ee2SSatish Balay   a->eta              = b->eta;
1048f17f7ee2SSatish Balay   a->leafsize         = b->leafsize;
1049f17f7ee2SSatish Balay   a->basisord         = b->basisord;
1050f17f7ee2SSatish Balay   a->max_rank         = b->max_rank;
1051f17f7ee2SSatish Balay   a->bs               = b->bs;
1052f17f7ee2SSatish Balay   a->rtol             = b->rtol;
1053f17f7ee2SSatish Balay   a->norm_max_samples = b->norm_max_samples;
1054f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) a->s = b->s;
1055f17f7ee2SSatish Balay 
1056f17f7ee2SSatish Balay   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1057f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1058f17f7ee2SSatish Balay 
1059f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1060f17f7ee2SSatish Balay   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1061f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1062f17f7ee2SSatish Balay   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1063f17f7ee2SSatish Balay #endif
1064f17f7ee2SSatish Balay #endif
1065f17f7ee2SSatish Balay   if (b->hmatrix) {
1066f17f7ee2SSatish Balay     a->hmatrix = new HMatrix(*b->hmatrix);
1067f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1068f17f7ee2SSatish Balay   }
1069f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1070f17f7ee2SSatish Balay   if (b->hmatrix_gpu) {
1071f17f7ee2SSatish Balay     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1072f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1073f17f7ee2SSatish Balay   }
1074f17f7ee2SSatish Balay #endif
1075f17f7ee2SSatish Balay   if (b->sf) {
1076f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->sf));
1077f17f7ee2SSatish Balay     a->sf = b->sf;
1078f17f7ee2SSatish Balay   }
1079f17f7ee2SSatish Balay   if (b->h2opus_indexmap) {
1080f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1081f17f7ee2SSatish Balay     a->h2opus_indexmap = b->h2opus_indexmap;
1082f17f7ee2SSatish Balay   }
1083f17f7ee2SSatish Balay 
1084f17f7ee2SSatish Balay   PetscCall(MatSetUp(A));
1085f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
1086f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) {
1087f17f7ee2SSatish Balay     A->assembled = PETSC_TRUE;
1088f17f7ee2SSatish Balay     a->orthogonal = b->orthogonal;
1089f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1090f17f7ee2SSatish Balay     A->offloadmask = B->offloadmask;
1091f17f7ee2SSatish Balay #endif
1092f17f7ee2SSatish Balay   }
1093f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1094f17f7ee2SSatish Balay   iscpu = B->boundtocpu;
1095f17f7ee2SSatish Balay #endif
1096f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,iscpu));
1097f17f7ee2SSatish Balay 
1098f17f7ee2SSatish Balay   *nA = A;
1099f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1100f17f7ee2SSatish Balay }
1101f17f7ee2SSatish Balay 
1102f17f7ee2SSatish Balay static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1103f17f7ee2SSatish Balay {
1104f17f7ee2SSatish Balay   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1105*036e4bdcSSatish Balay   PetscBool         isascii, vieweps;
1106f17f7ee2SSatish Balay   PetscMPIInt       size;
1107f17f7ee2SSatish Balay   PetscViewerFormat format;
1108f17f7ee2SSatish Balay 
1109f17f7ee2SSatish Balay   PetscFunctionBegin;
1110f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1111f17f7ee2SSatish Balay   PetscCall(PetscViewerGetFormat(view,&format));
1112f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1113f17f7ee2SSatish Balay   if (isascii) {
1114f17f7ee2SSatish Balay     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1115f17f7ee2SSatish Balay       if (size == 1) {
1116f17f7ee2SSatish Balay         FILE *fp;
1117f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1118f17f7ee2SSatish Balay         dumpHMatrix(*h2opus->hmatrix,6,fp);
1119f17f7ee2SSatish Balay       }
1120f17f7ee2SSatish Balay     } else {
1121f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1122f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1123f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1124f17f7ee2SSatish Balay       if (!h2opus->kernel) {
1125f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1126f17f7ee2SSatish Balay       } else {
1127f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1128f17f7ee2SSatish Balay       }
1129f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1130f17f7ee2SSatish Balay       if (size == 1) {
1131f17f7ee2SSatish Balay         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1132f17f7ee2SSatish Balay         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1133f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1134f17f7ee2SSatish Balay         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1135f17f7ee2SSatish Balay         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1136f17f7ee2SSatish Balay #endif
1137f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1138f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1139f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1140f17f7ee2SSatish Balay #endif
1141f17f7ee2SSatish Balay       } else {
1142f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1143f17f7ee2SSatish Balay         double      matrix_mem[4] = {0.,0.,0.,0.};
1144f17f7ee2SSatish Balay         PetscMPIInt rsize = 4;
1145f17f7ee2SSatish Balay #else
1146f17f7ee2SSatish Balay         double      matrix_mem[2] = {0.,0.};
1147f17f7ee2SSatish Balay         PetscMPIInt rsize = 2;
1148f17f7ee2SSatish Balay #endif
1149f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1150f17f7ee2SSatish Balay         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1151f17f7ee2SSatish Balay         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1152f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1153f17f7ee2SSatish Balay         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1154f17f7ee2SSatish Balay         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1155f17f7ee2SSatish Balay #endif
1156f17f7ee2SSatish Balay #endif
1157f17f7ee2SSatish Balay         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1158f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1159f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1160f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1161f17f7ee2SSatish Balay #endif
1162f17f7ee2SSatish Balay       }
1163f17f7ee2SSatish Balay     }
1164f17f7ee2SSatish Balay   }
1165*036e4bdcSSatish Balay   vieweps = PETSC_FALSE;
1166*036e4bdcSSatish Balay   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL));
1167*036e4bdcSSatish Balay   if (vieweps) {
1168f17f7ee2SSatish Balay     char filename[256];
1169f17f7ee2SSatish Balay     const char *name;
1170f17f7ee2SSatish Balay 
1171f17f7ee2SSatish Balay     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1172f17f7ee2SSatish Balay     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1173*036e4bdcSSatish Balay     PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL));
1174f17f7ee2SSatish Balay     outputEps(*h2opus->hmatrix,filename);
1175f17f7ee2SSatish Balay   }
1176f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1177f17f7ee2SSatish Balay }
1178f17f7ee2SSatish Balay 
1179f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1180f17f7ee2SSatish Balay {
1181f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1182f17f7ee2SSatish Balay   PetscReal      *gcoords;
1183f17f7ee2SSatish Balay   PetscInt       N;
1184f17f7ee2SSatish Balay   MPI_Comm       comm;
1185f17f7ee2SSatish Balay   PetscMPIInt    size;
1186f17f7ee2SSatish Balay   PetscBool      cong;
1187f17f7ee2SSatish Balay 
1188f17f7ee2SSatish Balay   PetscFunctionBegin;
1189f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->rmap));
1190f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->cmap));
1191f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1192f17f7ee2SSatish Balay   PetscCall(MatHasCongruentLayouts(A,&cong));
1193f17f7ee2SSatish Balay   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1194f17f7ee2SSatish Balay   N = A->rmap->N;
1195f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm,&size));
1196f17f7ee2SSatish Balay   if (spacedim > 0 && size > 1 && cdist) {
1197f17f7ee2SSatish Balay     PetscSF      sf;
1198f17f7ee2SSatish Balay     MPI_Datatype dtype;
1199f17f7ee2SSatish Balay 
1200f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1201f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_commit(&dtype));
1202f17f7ee2SSatish Balay 
1203f17f7ee2SSatish Balay     PetscCall(PetscSFCreate(comm,&sf));
1204f17f7ee2SSatish Balay     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1205f17f7ee2SSatish Balay     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1206f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1207f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1208f17f7ee2SSatish Balay     PetscCall(PetscSFDestroy(&sf));
1209f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_free(&dtype));
1210f17f7ee2SSatish Balay   } else gcoords = (PetscReal*)coords;
1211f17f7ee2SSatish Balay 
1212f17f7ee2SSatish Balay   delete h2opus->ptcloud;
1213f17f7ee2SSatish Balay   delete h2opus->kernel;
1214f17f7ee2SSatish Balay   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1215f17f7ee2SSatish Balay   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1216f17f7ee2SSatish Balay   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1217f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
1218f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1219f17f7ee2SSatish Balay }
1220f17f7ee2SSatish Balay 
1221f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1222f17f7ee2SSatish Balay static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1223f17f7ee2SSatish Balay {
1224f17f7ee2SSatish Balay   PetscMPIInt    size;
1225f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1226f17f7ee2SSatish Balay 
1227f17f7ee2SSatish Balay   PetscFunctionBegin;
1228f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1229f17f7ee2SSatish Balay   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1230f17f7ee2SSatish Balay     if (size > 1) {
1231f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1232f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1233f17f7ee2SSatish Balay       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1234f17f7ee2SSatish Balay       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1235f17f7ee2SSatish Balay #endif
1236f17f7ee2SSatish Balay     } else {
1237f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1238f17f7ee2SSatish Balay       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1239f17f7ee2SSatish Balay       else *a->hmatrix = *a->hmatrix_gpu;
1240f17f7ee2SSatish Balay     }
1241f17f7ee2SSatish Balay     delete a->hmatrix_gpu;
1242f17f7ee2SSatish Balay     delete a->dist_hmatrix_gpu;
1243f17f7ee2SSatish Balay     a->hmatrix_gpu = NULL;
1244f17f7ee2SSatish Balay     a->dist_hmatrix_gpu = NULL;
1245f17f7ee2SSatish Balay     A->offloadmask = PETSC_OFFLOAD_CPU;
1246f17f7ee2SSatish Balay   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1247f17f7ee2SSatish Balay     if (size > 1) {
1248f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1249f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1250f17f7ee2SSatish Balay       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1251f17f7ee2SSatish Balay       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1252f17f7ee2SSatish Balay #endif
1253f17f7ee2SSatish Balay     } else {
1254f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1255f17f7ee2SSatish Balay       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1256f17f7ee2SSatish Balay       else *a->hmatrix_gpu = *a->hmatrix;
1257f17f7ee2SSatish Balay     }
1258f17f7ee2SSatish Balay     delete a->hmatrix;
1259f17f7ee2SSatish Balay     delete a->dist_hmatrix;
1260f17f7ee2SSatish Balay     a->hmatrix = NULL;
1261f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
1262f17f7ee2SSatish Balay     A->offloadmask = PETSC_OFFLOAD_GPU;
1263f17f7ee2SSatish Balay   }
1264f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1265f17f7ee2SSatish Balay   if (!flg) {
1266f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1267f17f7ee2SSatish Balay   } else {
1268f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
1269f17f7ee2SSatish Balay   }
1270f17f7ee2SSatish Balay   A->boundtocpu = flg;
1271f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1272f17f7ee2SSatish Balay }
1273f17f7ee2SSatish Balay #endif
1274f17f7ee2SSatish Balay 
1275f17f7ee2SSatish Balay /*MC
1276f17f7ee2SSatish Balay      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1277f17f7ee2SSatish Balay 
1278f17f7ee2SSatish Balay    Options Database Keys:
1279f17f7ee2SSatish Balay .     -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()
1280f17f7ee2SSatish Balay 
1281f17f7ee2SSatish Balay    Notes:
1282f17f7ee2SSatish Balay      H2Opus implements hierarchical matrices in the H^2 flavour.
1283f17f7ee2SSatish Balay      It supports CPU or NVIDIA GPUs.
1284f17f7ee2SSatish Balay      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1285f17f7ee2SSatish Balay      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1286f17f7ee2SSatish Balay      For details and additional references, see
1287f17f7ee2SSatish Balay        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1288f17f7ee2SSatish Balay      available at https://arxiv.org/abs/2109.05451.
1289f17f7ee2SSatish Balay 
1290f17f7ee2SSatish Balay    Level: beginner
1291f17f7ee2SSatish Balay 
1292*036e4bdcSSatish Balay .seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1293f17f7ee2SSatish Balay M*/
1294f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1295f17f7ee2SSatish Balay {
1296f17f7ee2SSatish Balay   Mat_H2OPUS     *a;
1297f17f7ee2SSatish Balay   PetscMPIInt    size;
1298f17f7ee2SSatish Balay 
1299f17f7ee2SSatish Balay   PetscFunctionBegin;
1300f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1301f17f7ee2SSatish Balay   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1302f17f7ee2SSatish Balay #endif
1303f17f7ee2SSatish Balay   PetscCall(PetscNewLog(A,&a));
1304f17f7ee2SSatish Balay   A->data = (void*)a;
1305f17f7ee2SSatish Balay 
1306f17f7ee2SSatish Balay   a->eta              = 0.9;
1307f17f7ee2SSatish Balay   a->leafsize         = 32;
1308f17f7ee2SSatish Balay   a->basisord         = 4;
1309f17f7ee2SSatish Balay   a->max_rank         = 64;
1310f17f7ee2SSatish Balay   a->bs               = 32;
1311f17f7ee2SSatish Balay   a->rtol             = 1.e-4;
1312f17f7ee2SSatish Balay   a->s                = 1.0;
1313f17f7ee2SSatish Balay   a->norm_max_samples = 10;
1314*036e4bdcSSatish Balay   a->resize           = PETSC_TRUE; /* reallocate after compression */
1315f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1316f17f7ee2SSatish Balay   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1317f17f7ee2SSatish Balay #else
1318f17f7ee2SSatish Balay   h2opusCreateHandle(&a->handle);
1319f17f7ee2SSatish Balay #endif
1320f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1321f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
1322f17f7ee2SSatish Balay   PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));
1323f17f7ee2SSatish Balay 
1324f17f7ee2SSatish Balay   A->ops->destroy          = MatDestroy_H2OPUS;
1325f17f7ee2SSatish Balay   A->ops->view             = MatView_H2OPUS;
1326f17f7ee2SSatish Balay   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1327f17f7ee2SSatish Balay   A->ops->mult             = MatMult_H2OPUS;
1328f17f7ee2SSatish Balay   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1329f17f7ee2SSatish Balay   A->ops->multadd          = MatMultAdd_H2OPUS;
1330f17f7ee2SSatish Balay   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1331f17f7ee2SSatish Balay   A->ops->scale            = MatScale_H2OPUS;
1332f17f7ee2SSatish Balay   A->ops->duplicate        = MatDuplicate_H2OPUS;
1333f17f7ee2SSatish Balay   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1334f17f7ee2SSatish Balay   A->ops->norm             = MatNorm_H2OPUS;
1335f17f7ee2SSatish Balay   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1336f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1337f17f7ee2SSatish Balay   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1338f17f7ee2SSatish Balay #endif
1339f17f7ee2SSatish Balay 
1340f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
1341f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
1342f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
1343f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
1344f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1345f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1346f17f7ee2SSatish Balay   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1347f17f7ee2SSatish Balay #endif
1348f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1349f17f7ee2SSatish Balay }
1350f17f7ee2SSatish Balay 
1351f17f7ee2SSatish Balay /*@C
1352f17f7ee2SSatish Balay      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1353f17f7ee2SSatish Balay 
1354f17f7ee2SSatish Balay    Input Parameter:
1355f17f7ee2SSatish Balay .     A - the matrix
1356f17f7ee2SSatish Balay 
1357f17f7ee2SSatish Balay    Level: intermediate
1358f17f7ee2SSatish Balay 
1359*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1360f17f7ee2SSatish Balay @*/
1361f17f7ee2SSatish Balay PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1362f17f7ee2SSatish Balay {
1363f17f7ee2SSatish Balay   PetscBool      ish2opus;
1364f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1365f17f7ee2SSatish Balay   PetscMPIInt    size;
1366f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1367f17f7ee2SSatish Balay 
1368f17f7ee2SSatish Balay   PetscFunctionBegin;
1369f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1370f17f7ee2SSatish Balay   PetscValidType(A,1);
1371f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1372f17f7ee2SSatish Balay   if (!ish2opus) PetscFunctionReturn(0);
1373f17f7ee2SSatish Balay   if (a->orthogonal) PetscFunctionReturn(0);
1374f17f7ee2SSatish Balay   HLibProfile::clear();
1375f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
1376f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1377f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1378f17f7ee2SSatish Balay #endif
1379f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1380f17f7ee2SSatish Balay   if (size > 1) {
1381f17f7ee2SSatish Balay     if (boundtocpu) {
1382f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1383f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1384f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix, a->handle);
1385f17f7ee2SSatish Balay #endif
1386f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1387f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1388f17f7ee2SSatish Balay     } else {
1389f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1390f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1391f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1392f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1393f17f7ee2SSatish Balay #endif
1394f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1395f17f7ee2SSatish Balay #endif
1396f17f7ee2SSatish Balay     }
1397f17f7ee2SSatish Balay   } else {
1398f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1399f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1400f17f7ee2SSatish Balay #else
1401f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1402f17f7ee2SSatish Balay #endif
1403f17f7ee2SSatish Balay     if (boundtocpu) {
1404f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1405f17f7ee2SSatish Balay       horthog(*a->hmatrix, handle);
1406f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1407f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1408f17f7ee2SSatish Balay     } else {
1409f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1410f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1411f17f7ee2SSatish Balay       horthog(*a->hmatrix_gpu, handle);
1412f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1413f17f7ee2SSatish Balay #endif
1414f17f7ee2SSatish Balay     }
1415f17f7ee2SSatish Balay   }
1416f17f7ee2SSatish Balay   a->orthogonal = PETSC_TRUE;
1417f17f7ee2SSatish Balay   { /* log flops */
1418f17f7ee2SSatish Balay     double gops,time,perf,dev;
1419f17f7ee2SSatish Balay     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1420f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1421f17f7ee2SSatish Balay     if (boundtocpu) {
1422f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
1423f17f7ee2SSatish Balay     } else {
1424f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
1425f17f7ee2SSatish Balay     }
1426f17f7ee2SSatish Balay #else
1427f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
1428f17f7ee2SSatish Balay #endif
1429f17f7ee2SSatish Balay   }
1430f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
1431f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1432f17f7ee2SSatish Balay }
1433f17f7ee2SSatish Balay 
1434f17f7ee2SSatish Balay /*@C
1435f17f7ee2SSatish Balay      MatH2OpusCompress - Compress a hierarchical matrix.
1436f17f7ee2SSatish Balay 
1437f17f7ee2SSatish Balay    Input Parameters:
1438f17f7ee2SSatish Balay +     A - the matrix
1439f17f7ee2SSatish Balay -     tol - the absolute truncation threshold
1440f17f7ee2SSatish Balay 
1441f17f7ee2SSatish Balay    Level: intermediate
1442f17f7ee2SSatish Balay 
1443*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1444f17f7ee2SSatish Balay @*/
1445f17f7ee2SSatish Balay PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1446f17f7ee2SSatish Balay {
1447f17f7ee2SSatish Balay   PetscBool      ish2opus;
1448f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1449f17f7ee2SSatish Balay   PetscMPIInt    size;
1450f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1451f17f7ee2SSatish Balay 
1452f17f7ee2SSatish Balay   PetscFunctionBegin;
1453f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1454f17f7ee2SSatish Balay   PetscValidType(A,1);
1455f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A,tol,2);
1456f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1457f17f7ee2SSatish Balay   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
1458f17f7ee2SSatish Balay   PetscCall(MatH2OpusOrthogonalize(A));
1459f17f7ee2SSatish Balay   HLibProfile::clear();
1460f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
1461f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1462f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1463f17f7ee2SSatish Balay #endif
1464f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1465f17f7ee2SSatish Balay   if (size > 1) {
1466f17f7ee2SSatish Balay     if (boundtocpu) {
1467f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1468f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1469f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1470*036e4bdcSSatish Balay       if (a->resize) {
1471*036e4bdcSSatish Balay         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1472*036e4bdcSSatish Balay         delete a->dist_hmatrix;
1473*036e4bdcSSatish Balay         a->dist_hmatrix = dist_hmatrix;
1474*036e4bdcSSatish Balay       }
1475f17f7ee2SSatish Balay #endif
1476f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1477f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1478f17f7ee2SSatish Balay     } else {
1479f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1480f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1481f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1482f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1483*036e4bdcSSatish Balay 
1484*036e4bdcSSatish Balay       if (a->resize) {
1485*036e4bdcSSatish Balay         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1486*036e4bdcSSatish Balay         delete a->dist_hmatrix_gpu;
1487*036e4bdcSSatish Balay         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1488*036e4bdcSSatish Balay       }
1489f17f7ee2SSatish Balay #endif
1490f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1491f17f7ee2SSatish Balay #endif
1492f17f7ee2SSatish Balay     }
1493f17f7ee2SSatish Balay   } else {
1494f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1495f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1496f17f7ee2SSatish Balay #else
1497f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1498f17f7ee2SSatish Balay #endif
1499f17f7ee2SSatish Balay     if (boundtocpu) {
1500f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1501f17f7ee2SSatish Balay       hcompress(*a->hmatrix, tol, handle);
1502*036e4bdcSSatish Balay 
1503*036e4bdcSSatish Balay       if (a->resize) {
1504*036e4bdcSSatish Balay         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1505*036e4bdcSSatish Balay         delete a->hmatrix;
1506*036e4bdcSSatish Balay         a->hmatrix = hmatrix;
1507*036e4bdcSSatish Balay       }
1508f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1509f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1510f17f7ee2SSatish Balay     } else {
1511f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1512f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1513f17f7ee2SSatish Balay       hcompress(*a->hmatrix_gpu, tol, handle);
1514f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1515*036e4bdcSSatish Balay 
1516*036e4bdcSSatish Balay       if (a->resize) {
1517*036e4bdcSSatish Balay         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1518*036e4bdcSSatish Balay         delete a->hmatrix_gpu;
1519*036e4bdcSSatish Balay         a->hmatrix_gpu = hmatrix_gpu;
1520*036e4bdcSSatish Balay       }
1521f17f7ee2SSatish Balay #endif
1522f17f7ee2SSatish Balay     }
1523f17f7ee2SSatish Balay   }
1524f17f7ee2SSatish Balay   { /* log flops */
1525f17f7ee2SSatish Balay     double gops,time,perf,dev;
1526f17f7ee2SSatish Balay     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1527f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1528f17f7ee2SSatish Balay     if (boundtocpu) {
1529f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
1530f17f7ee2SSatish Balay     } else {
1531f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
1532f17f7ee2SSatish Balay     }
1533f17f7ee2SSatish Balay #else
1534f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
1535f17f7ee2SSatish Balay #endif
1536f17f7ee2SSatish Balay   }
1537f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
1538f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1539f17f7ee2SSatish Balay }
1540f17f7ee2SSatish Balay 
1541f17f7ee2SSatish Balay /*@C
1542f17f7ee2SSatish Balay      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1543f17f7ee2SSatish Balay 
1544f17f7ee2SSatish Balay    Input Parameters:
1545f17f7ee2SSatish Balay +     A - the hierarchical matrix
1546f17f7ee2SSatish Balay .     B - the matrix to be sampled
1547f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1548f17f7ee2SSatish Balay -     tol - relative tolerance for construction
1549f17f7ee2SSatish Balay 
1550f17f7ee2SSatish Balay    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1551f17f7ee2SSatish Balay 
1552f17f7ee2SSatish Balay    Level: intermediate
1553f17f7ee2SSatish Balay 
1554*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1555f17f7ee2SSatish Balay @*/
1556f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1557f17f7ee2SSatish Balay {
1558f17f7ee2SSatish Balay   PetscBool      ish2opus;
1559f17f7ee2SSatish Balay 
1560f17f7ee2SSatish Balay   PetscFunctionBegin;
1561f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1562f17f7ee2SSatish Balay   PetscValidType(A,1);
1563f17f7ee2SSatish Balay   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1564f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(A,bs,3);
1565f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A,tol,3);
1566f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1567f17f7ee2SSatish Balay   if (ish2opus) {
1568f17f7ee2SSatish Balay     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1569f17f7ee2SSatish Balay 
1570f17f7ee2SSatish Balay     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1571f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(B);
1572f17f7ee2SSatish Balay     if (bs > 0) a->bs = bs;
1573f17f7ee2SSatish Balay     if (tol > 0.) a->rtol = tol;
1574f17f7ee2SSatish Balay     delete a->kernel;
1575f17f7ee2SSatish Balay   }
1576f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1577f17f7ee2SSatish Balay }
1578f17f7ee2SSatish Balay 
1579f17f7ee2SSatish Balay /*@C
1580f17f7ee2SSatish Balay      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1581f17f7ee2SSatish Balay 
1582f17f7ee2SSatish Balay    Input Parameters:
1583f17f7ee2SSatish Balay +     comm - MPI communicator
1584f17f7ee2SSatish Balay .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1585f17f7ee2SSatish Balay .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1586f17f7ee2SSatish Balay .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1587f17f7ee2SSatish Balay .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1588f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1589f17f7ee2SSatish Balay .     coords - coordinates of the points
1590f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
1591f17f7ee2SSatish Balay .     kernel - computational kernel (or NULL)
1592f17f7ee2SSatish Balay .     kernelctx - kernel context
1593f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1594f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1595f17f7ee2SSatish Balay -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1596f17f7ee2SSatish Balay 
1597f17f7ee2SSatish Balay    Output Parameter:
1598f17f7ee2SSatish Balay .     nA - matrix
1599f17f7ee2SSatish Balay 
1600f17f7ee2SSatish Balay    Options Database Keys:
1601f17f7ee2SSatish Balay +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1602f17f7ee2SSatish Balay .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1603f17f7ee2SSatish Balay .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1604f17f7ee2SSatish Balay -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1605f17f7ee2SSatish Balay 
1606f17f7ee2SSatish Balay    Level: intermediate
1607f17f7ee2SSatish Balay 
1608*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1609f17f7ee2SSatish Balay @*/
1610f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1611f17f7ee2SSatish Balay {
1612f17f7ee2SSatish Balay   Mat            A;
1613f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus;
1614f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1615f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_FALSE;
1616f17f7ee2SSatish Balay #else
1617f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_TRUE;
1618f17f7ee2SSatish Balay #endif
1619f17f7ee2SSatish Balay 
1620f17f7ee2SSatish Balay   PetscFunctionBegin;
1621*036e4bdcSSatish Balay   PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1622f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1623f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,m,n,M,N));
1624*036e4bdcSSatish Balay   PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1625f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1626f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,iscpu));
1627f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1628f17f7ee2SSatish Balay 
1629f17f7ee2SSatish Balay   h2opus = (Mat_H2OPUS*)A->data;
1630f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1631f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1632f17f7ee2SSatish Balay   if (basisord > 0) h2opus->basisord = basisord;
1633f17f7ee2SSatish Balay 
1634f17f7ee2SSatish Balay   *nA = A;
1635f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1636f17f7ee2SSatish Balay }
1637f17f7ee2SSatish Balay 
1638f17f7ee2SSatish Balay /*@C
1639f17f7ee2SSatish Balay      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1640f17f7ee2SSatish Balay 
1641f17f7ee2SSatish Balay    Input Parameters:
1642f17f7ee2SSatish Balay +     B - the matrix to be sampled
1643f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1644f17f7ee2SSatish Balay .     coords - coordinates of the points
1645f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
1646f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1647f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1648f17f7ee2SSatish Balay .     maxrank - maximum rank allowed
1649f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1650f17f7ee2SSatish Balay -     rtol - relative tolerance for construction
1651f17f7ee2SSatish Balay 
1652f17f7ee2SSatish Balay    Output Parameter:
1653f17f7ee2SSatish Balay .     nA - matrix
1654f17f7ee2SSatish Balay 
1655f17f7ee2SSatish Balay    Options Database Keys:
1656f17f7ee2SSatish Balay +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1657f17f7ee2SSatish Balay .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1658f17f7ee2SSatish Balay .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1659f17f7ee2SSatish Balay .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1660f17f7ee2SSatish Balay .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1661f17f7ee2SSatish Balay .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1662f17f7ee2SSatish Balay .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1663f17f7ee2SSatish Balay -     -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms
1664f17f7ee2SSatish Balay 
1665f17f7ee2SSatish Balay    Notes: not available in parallel
1666f17f7ee2SSatish Balay 
1667f17f7ee2SSatish Balay    Level: intermediate
1668f17f7ee2SSatish Balay 
1669*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1670f17f7ee2SSatish Balay @*/
1671f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1672f17f7ee2SSatish Balay {
1673f17f7ee2SSatish Balay   Mat            A;
1674f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus;
1675f17f7ee2SSatish Balay   MPI_Comm       comm;
1676f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1677f17f7ee2SSatish Balay 
1678f17f7ee2SSatish Balay   PetscFunctionBegin;
1679f17f7ee2SSatish Balay   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1680f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,spacedim,2);
1681f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(B,cdist,4);
1682f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B,eta,5);
1683f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,leafsize,6);
1684f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,maxrank,7);
1685f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,bs,8);
1686f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B,rtol,9);
1687f17f7ee2SSatish Balay   PetscValidPointer(nA,10);
1688f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1689*036e4bdcSSatish Balay   PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1690*036e4bdcSSatish Balay   PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1691f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1692f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1693f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1694f17f7ee2SSatish Balay   {
1695f17f7ee2SSatish Balay     PetscBool iscuda;
1696f17f7ee2SSatish Balay     VecType   vtype;
1697f17f7ee2SSatish Balay 
1698f17f7ee2SSatish Balay     PetscCall(MatGetVecType(B,&vtype));
1699f17f7ee2SSatish Balay     PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
1700f17f7ee2SSatish Balay     if (!iscuda) {
1701f17f7ee2SSatish Balay       PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
1702f17f7ee2SSatish Balay       if (!iscuda) {
1703f17f7ee2SSatish Balay         PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
1704f17f7ee2SSatish Balay       }
1705f17f7ee2SSatish Balay     }
1706f17f7ee2SSatish Balay     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1707f17f7ee2SSatish Balay   }
1708f17f7ee2SSatish Balay #endif
1709f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1710f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,boundtocpu));
1711f17f7ee2SSatish Balay   if (spacedim) {
1712f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
1713f17f7ee2SSatish Balay   }
1714f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B,A));
1715f17f7ee2SSatish Balay   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1716f17f7ee2SSatish Balay 
1717f17f7ee2SSatish Balay   h2opus = (Mat_H2OPUS*)A->data;
1718f17f7ee2SSatish Balay   h2opus->sampler = new PetscMatrixSampler(B);
1719f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1720f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1721f17f7ee2SSatish Balay   if (maxrank > 0) h2opus->max_rank = maxrank;
1722f17f7ee2SSatish Balay   if (bs > 0) h2opus->bs = bs;
1723f17f7ee2SSatish Balay   if (rtol > 0.) h2opus->rtol = rtol;
1724f17f7ee2SSatish Balay   *nA = A;
1725f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
1726f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1727f17f7ee2SSatish Balay }
1728f17f7ee2SSatish Balay 
1729f17f7ee2SSatish Balay /*@C
1730f17f7ee2SSatish Balay      MatH2OpusGetIndexMap - Access reordering index set.
1731f17f7ee2SSatish Balay 
1732f17f7ee2SSatish Balay    Input Parameters:
1733f17f7ee2SSatish Balay .     A - the matrix
1734f17f7ee2SSatish Balay 
1735f17f7ee2SSatish Balay    Output Parameter:
1736f17f7ee2SSatish Balay .     indexmap - the index set for the reordering
1737f17f7ee2SSatish Balay 
1738f17f7ee2SSatish Balay    Level: intermediate
1739f17f7ee2SSatish Balay 
1740*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1741f17f7ee2SSatish Balay @*/
1742f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1743f17f7ee2SSatish Balay {
1744f17f7ee2SSatish Balay   PetscBool      ish2opus;
1745f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1746f17f7ee2SSatish Balay 
1747f17f7ee2SSatish Balay   PetscFunctionBegin;
1748f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1749f17f7ee2SSatish Balay   PetscValidType(A,1);
1750f17f7ee2SSatish Balay   PetscValidPointer(indexmap,2);
1751f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1752f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1753f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1754f17f7ee2SSatish Balay   *indexmap = a->h2opus_indexmap;
1755f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1756f17f7ee2SSatish Balay }
1757f17f7ee2SSatish Balay 
1758f17f7ee2SSatish Balay /*@C
1759f17f7ee2SSatish Balay      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1760f17f7ee2SSatish Balay 
1761f17f7ee2SSatish Balay    Input Parameters:
1762f17f7ee2SSatish Balay +     A - the matrix
1763f17f7ee2SSatish Balay .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1764f17f7ee2SSatish Balay -     in - the vector to be mapped
1765f17f7ee2SSatish Balay 
1766f17f7ee2SSatish Balay    Output Parameter:
1767f17f7ee2SSatish Balay .     out - the newly created mapped vector
1768f17f7ee2SSatish Balay 
1769f17f7ee2SSatish Balay    Level: intermediate
1770f17f7ee2SSatish Balay 
1771*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1772f17f7ee2SSatish Balay @*/
1773f17f7ee2SSatish Balay PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1774f17f7ee2SSatish Balay {
1775f17f7ee2SSatish Balay   PetscBool      ish2opus;
1776f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1777f17f7ee2SSatish Balay   PetscScalar    *xin,*xout;
1778f17f7ee2SSatish Balay   PetscBool      nm;
1779f17f7ee2SSatish Balay 
1780f17f7ee2SSatish Balay   PetscFunctionBegin;
1781f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1782f17f7ee2SSatish Balay   PetscValidType(A,1);
1783f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1784f17f7ee2SSatish Balay   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1785f17f7ee2SSatish Balay   PetscValidPointer(out,4);
1786f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1787f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1788f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1789f17f7ee2SSatish Balay   nm   = a->nativemult;
1790f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1791f17f7ee2SSatish Balay   PetscCall(MatCreateVecs(A,out,NULL));
1792f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A,nm));
1793f17f7ee2SSatish Balay   if (!a->sf) { /* same ordering */
1794f17f7ee2SSatish Balay     PetscCall(VecCopy(in,*out));
1795f17f7ee2SSatish Balay     PetscFunctionReturn(0);
1796f17f7ee2SSatish Balay   }
1797f17f7ee2SSatish Balay   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1798f17f7ee2SSatish Balay   PetscCall(VecGetArrayWrite(*out,&xout));
1799f17f7ee2SSatish Balay   if (nativetopetsc) {
1800f17f7ee2SSatish Balay     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1801f17f7ee2SSatish Balay     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1802f17f7ee2SSatish Balay   } else {
1803f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1804f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1805f17f7ee2SSatish Balay   }
1806f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1807f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayWrite(*out,&xout));
1808f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1809f17f7ee2SSatish Balay }
1810f17f7ee2SSatish Balay 
1811f17f7ee2SSatish Balay /*@C
1812f17f7ee2SSatish Balay      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1813f17f7ee2SSatish Balay 
1814f17f7ee2SSatish Balay    Input Parameters:
1815f17f7ee2SSatish Balay +     A - the hierarchical matrix
1816f17f7ee2SSatish Balay .     s - the scaling factor
1817f17f7ee2SSatish Balay .     U - the dense low-rank update matrix
1818f17f7ee2SSatish Balay -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1819f17f7ee2SSatish Balay 
1820f17f7ee2SSatish Balay    Notes: The U and V matrices must be in dense format
1821f17f7ee2SSatish Balay 
1822f17f7ee2SSatish Balay    Level: intermediate
1823f17f7ee2SSatish Balay 
1824*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1825f17f7ee2SSatish Balay @*/
1826f17f7ee2SSatish Balay PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1827f17f7ee2SSatish Balay {
1828f17f7ee2SSatish Balay   PetscBool      flg;
1829f17f7ee2SSatish Balay 
1830f17f7ee2SSatish Balay   PetscFunctionBegin;
1831f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1832f17f7ee2SSatish Balay   PetscValidType(A,1);
1833f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1834f17f7ee2SSatish Balay   PetscValidHeaderSpecific(U,MAT_CLASSID,2);
1835f17f7ee2SSatish Balay   PetscCheckSameComm(A,1,U,2);
1836f17f7ee2SSatish Balay   if (V) {
1837f17f7ee2SSatish Balay     PetscValidHeaderSpecific(V,MAT_CLASSID,3);
1838f17f7ee2SSatish Balay     PetscCheckSameComm(A,1,V,3);
1839f17f7ee2SSatish Balay   }
1840f17f7ee2SSatish Balay   PetscValidLogicalCollectiveScalar(A,s,4);
1841f17f7ee2SSatish Balay 
1842f17f7ee2SSatish Balay   if (!V) V = U;
1843*036e4bdcSSatish Balay   PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
1844f17f7ee2SSatish Balay   if (!U->cmap->N) PetscFunctionReturn(0);
1845f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
1846f17f7ee2SSatish Balay   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
1847f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
1848f17f7ee2SSatish Balay   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
1849f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
1850f17f7ee2SSatish Balay   if (flg) {
1851f17f7ee2SSatish Balay     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1852f17f7ee2SSatish Balay     const PetscScalar *u,*v,*uu,*vv;
1853f17f7ee2SSatish Balay     PetscInt          ldu,ldv;
1854f17f7ee2SSatish Balay     PetscMPIInt       size;
1855f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1856f17f7ee2SSatish Balay     h2opusHandle_t    handle = a->handle->handle;
1857f17f7ee2SSatish Balay #else
1858f17f7ee2SSatish Balay     h2opusHandle_t    handle = a->handle;
1859f17f7ee2SSatish Balay #endif
1860f17f7ee2SSatish Balay     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1861f17f7ee2SSatish Balay     PetscSF           usf,vsf;
1862f17f7ee2SSatish Balay 
1863f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1864*036e4bdcSSatish Balay     PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
1865f17f7ee2SSatish Balay     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
1866f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
1867f17f7ee2SSatish Balay     PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
1868f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
1869f17f7ee2SSatish Balay     PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
1870f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(U,&ldu));
1871f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(V,&ldv));
1872f17f7ee2SSatish Balay     PetscCall(MatBoundToCPU(A,&flg));
1873f17f7ee2SSatish Balay     if (usesf) {
1874f17f7ee2SSatish Balay       PetscInt n;
1875f17f7ee2SSatish Balay 
1876f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
1877f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
1878f17f7ee2SSatish Balay       PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
1879f17f7ee2SSatish Balay       PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
1880f17f7ee2SSatish Balay       ldu = n;
1881f17f7ee2SSatish Balay       ldv = n;
1882f17f7ee2SSatish Balay     }
1883f17f7ee2SSatish Balay     if (flg) {
1884f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1885f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(U,&u));
1886f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(V,&v));
1887f17f7ee2SSatish Balay       if (usesf) {
1888f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy);
1889f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1890f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1891f17f7ee2SSatish Balay         if (U != V) {
1892f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx);
1893f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1894f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1895f17f7ee2SSatish Balay         } else uu = vv;
1896f17f7ee2SSatish Balay       } else { uu = u; vv = v; }
1897f17f7ee2SSatish Balay       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1898f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(U,&u));
1899f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(V,&v));
1900f17f7ee2SSatish Balay     } else {
1901f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1902f17f7ee2SSatish Balay       PetscBool flgU, flgV;
1903f17f7ee2SSatish Balay 
1904f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1905f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
1906f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
1907f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
1908f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
1909f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(U,&u));
1910f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(V,&v));
1911f17f7ee2SSatish Balay       if (usesf) {
1912f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1913f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1914f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1915f17f7ee2SSatish Balay         if (U != V) {
1916f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1917f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1918f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1919f17f7ee2SSatish Balay         } else uu = vv;
1920f17f7ee2SSatish Balay       } else { uu = u; vv = v; }
1921f17f7ee2SSatish Balay #else
1922f17f7ee2SSatish Balay       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1923f17f7ee2SSatish Balay #endif
1924f17f7ee2SSatish Balay       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1925f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1926f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
1927f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
1928f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
1929f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
1930f17f7ee2SSatish Balay #endif
1931f17f7ee2SSatish Balay     }
1932f17f7ee2SSatish Balay     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
1933f17f7ee2SSatish Balay     a->orthogonal = PETSC_FALSE;
1934f17f7ee2SSatish Balay   }
1935f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1936f17f7ee2SSatish Balay }
1937f17f7ee2SSatish Balay #endif
1938