xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision f17f7ee2a07b24904e01a3a56815833a6ed82889)
1*f17f7ee2SSatish Balay #include <h2opusconf.h>
2*f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3*f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4*f17f7ee2SSatish Balay #include <h2opus.h>
5*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
6*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h>
7*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h>
8*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h>
9*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h>
10*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h>
11*f17f7ee2SSatish Balay #endif
12*f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h>
13*f17f7ee2SSatish Balay #include <petsc/private/matimpl.h>
14*f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h>
15*f17f7ee2SSatish Balay #include <petsc/private/deviceimpl.h>
16*f17f7ee2SSatish Balay #include <petscsf.h>
17*f17f7ee2SSatish Balay 
18*f17f7ee2SSatish Balay /* math2opusutils */
19*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24*f17f7ee2SSatish Balay 
25*f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26*f17f7ee2SSatish Balay 
27*f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */
28*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29*f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU
30*f17f7ee2SSatish Balay #endif
31*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
32*f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33*f17f7ee2SSatish Balay #else
34*f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) 0
35*f17f7ee2SSatish Balay #endif
36*f17f7ee2SSatish Balay 
37*f17f7ee2SSatish Balay // TODO H2OPUS:
38*f17f7ee2SSatish Balay // DistributedHMatrix
39*f17f7ee2SSatish Balay //   unsymmetric ?
40*f17f7ee2SSatish Balay //   transpose for distributed_hgemv?
41*f17f7ee2SSatish Balay //   clearData()
42*f17f7ee2SSatish Balay // Unify interface for sequential and parallel?
43*f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44*f17f7ee2SSatish Balay //
45*f17f7ee2SSatish Balay template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46*f17f7ee2SSatish Balay {
47*f17f7ee2SSatish Balay   private:
48*f17f7ee2SSatish Balay     int dimension;
49*f17f7ee2SSatish Balay     size_t num_points;
50*f17f7ee2SSatish Balay     std::vector<T> pts;
51*f17f7ee2SSatish Balay 
52*f17f7ee2SSatish Balay   public:
53*f17f7ee2SSatish Balay     PetscPointCloud(int dim, size_t num_pts, const T coords[])
54*f17f7ee2SSatish Balay     {
55*f17f7ee2SSatish Balay       dim = dim > 0 ? dim : 1;
56*f17f7ee2SSatish Balay       this->dimension = dim;
57*f17f7ee2SSatish Balay       this->num_points = num_pts;
58*f17f7ee2SSatish Balay 
59*f17f7ee2SSatish Balay       pts.resize(num_pts*dim);
60*f17f7ee2SSatish Balay       if (coords) {
61*f17f7ee2SSatish Balay         for (size_t n = 0; n < num_points; n++)
62*f17f7ee2SSatish Balay           for (int i = 0; i < dim; i++)
63*f17f7ee2SSatish Balay             pts[n*dim + i] = coords[n*dim + i];
64*f17f7ee2SSatish Balay       } else {
65*f17f7ee2SSatish Balay         PetscReal h = 1./(num_points - 1);
66*f17f7ee2SSatish Balay         for (size_t n = 0; n < num_points; n++)
67*f17f7ee2SSatish Balay           for (int i = 0; i < dim; i++)
68*f17f7ee2SSatish Balay             pts[n*dim + i] = i*h;
69*f17f7ee2SSatish Balay       }
70*f17f7ee2SSatish Balay     }
71*f17f7ee2SSatish Balay 
72*f17f7ee2SSatish Balay     PetscPointCloud(const PetscPointCloud<T>& other)
73*f17f7ee2SSatish Balay     {
74*f17f7ee2SSatish Balay       size_t N = other.dimension * other.num_points;
75*f17f7ee2SSatish Balay       this->dimension = other.dimension;
76*f17f7ee2SSatish Balay       this->num_points = other.num_points;
77*f17f7ee2SSatish Balay       this->pts.resize(N);
78*f17f7ee2SSatish Balay       for (size_t i = 0; i < N; i++)
79*f17f7ee2SSatish Balay         this->pts[i] = other.pts[i];
80*f17f7ee2SSatish Balay     }
81*f17f7ee2SSatish Balay 
82*f17f7ee2SSatish Balay     int getDimension() const
83*f17f7ee2SSatish Balay     {
84*f17f7ee2SSatish Balay         return dimension;
85*f17f7ee2SSatish Balay     }
86*f17f7ee2SSatish Balay 
87*f17f7ee2SSatish Balay     size_t getDataSetSize() const
88*f17f7ee2SSatish Balay     {
89*f17f7ee2SSatish Balay         return num_points;
90*f17f7ee2SSatish Balay     }
91*f17f7ee2SSatish Balay 
92*f17f7ee2SSatish Balay     T getDataPoint(size_t idx, int dim) const
93*f17f7ee2SSatish Balay     {
94*f17f7ee2SSatish Balay         assert(dim < dimension && idx < num_points);
95*f17f7ee2SSatish Balay         return pts[idx*dimension + dim];
96*f17f7ee2SSatish Balay     }
97*f17f7ee2SSatish Balay 
98*f17f7ee2SSatish Balay     void Print(std::ostream& out = std::cout)
99*f17f7ee2SSatish Balay     {
100*f17f7ee2SSatish Balay        out << "Dimension: " << dimension << std::endl;
101*f17f7ee2SSatish Balay        out << "NumPoints: " << num_points << std::endl;
102*f17f7ee2SSatish Balay        for (size_t n = 0; n < num_points; n++) {
103*f17f7ee2SSatish Balay          for (int d = 0; d < dimension; d++)
104*f17f7ee2SSatish Balay            out << pts[n*dimension + d] << " ";
105*f17f7ee2SSatish Balay          out << std::endl;
106*f17f7ee2SSatish Balay        }
107*f17f7ee2SSatish Balay     }
108*f17f7ee2SSatish Balay };
109*f17f7ee2SSatish Balay 
110*f17f7ee2SSatish Balay template<class T> class PetscFunctionGenerator
111*f17f7ee2SSatish Balay {
112*f17f7ee2SSatish Balay private:
113*f17f7ee2SSatish Balay   MatH2OpusKernel k;
114*f17f7ee2SSatish Balay   int             dim;
115*f17f7ee2SSatish Balay   void            *ctx;
116*f17f7ee2SSatish Balay 
117*f17f7ee2SSatish Balay public:
118*f17f7ee2SSatish Balay     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
119*f17f7ee2SSatish Balay     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
120*f17f7ee2SSatish Balay     T operator()(PetscReal *pt1, PetscReal *pt2)
121*f17f7ee2SSatish Balay     {
122*f17f7ee2SSatish Balay         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
123*f17f7ee2SSatish Balay     }
124*f17f7ee2SSatish Balay };
125*f17f7ee2SSatish Balay 
126*f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp>
127*f17f7ee2SSatish Balay 
128*f17f7ee2SSatish Balay /* just to not clutter the code */
129*f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU)
130*f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU;
131*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
132*f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU;
133*f17f7ee2SSatish Balay #endif
134*f17f7ee2SSatish Balay #endif
135*f17f7ee2SSatish Balay 
136*f17f7ee2SSatish Balay typedef struct {
137*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
138*f17f7ee2SSatish Balay   distributedH2OpusHandle_t handle;
139*f17f7ee2SSatish Balay #else
140*f17f7ee2SSatish Balay   h2opusHandle_t handle;
141*f17f7ee2SSatish Balay #endif
142*f17f7ee2SSatish Balay   /* Sequential and parallel matrices are two different classes at the moment */
143*f17f7ee2SSatish Balay   HMatrix *hmatrix;
144*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
145*f17f7ee2SSatish Balay   DistributedHMatrix *dist_hmatrix;
146*f17f7ee2SSatish Balay #else
147*f17f7ee2SSatish Balay   HMatrix *dist_hmatrix; /* just to not clutter the code */
148*f17f7ee2SSatish Balay #endif
149*f17f7ee2SSatish Balay   /* May use permutations */
150*f17f7ee2SSatish Balay   PetscSF sf;
151*f17f7ee2SSatish Balay   PetscLayout h2opus_rmap, h2opus_cmap;
152*f17f7ee2SSatish Balay   IS h2opus_indexmap;
153*f17f7ee2SSatish Balay   thrust::host_vector<PetscScalar> *xx,*yy;
154*f17f7ee2SSatish Balay   PetscInt xxs,yys;
155*f17f7ee2SSatish Balay   PetscBool multsetup;
156*f17f7ee2SSatish Balay 
157*f17f7ee2SSatish Balay   /* GPU */
158*f17f7ee2SSatish Balay   HMatrix_GPU *hmatrix_gpu;
159*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
160*f17f7ee2SSatish Balay   DistributedHMatrix_GPU *dist_hmatrix_gpu;
161*f17f7ee2SSatish Balay #else
162*f17f7ee2SSatish Balay   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
163*f17f7ee2SSatish Balay #endif
164*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
165*f17f7ee2SSatish Balay   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
166*f17f7ee2SSatish Balay   PetscInt xxs_gpu,yys_gpu;
167*f17f7ee2SSatish Balay #endif
168*f17f7ee2SSatish Balay 
169*f17f7ee2SSatish Balay   /* construction from matvecs */
170*f17f7ee2SSatish Balay   PetscMatrixSampler* sampler;
171*f17f7ee2SSatish Balay   PetscBool nativemult;
172*f17f7ee2SSatish Balay 
173*f17f7ee2SSatish Balay   /* Admissibility */
174*f17f7ee2SSatish Balay   PetscReal eta;
175*f17f7ee2SSatish Balay   PetscInt  leafsize;
176*f17f7ee2SSatish Balay 
177*f17f7ee2SSatish Balay   /* for dof reordering */
178*f17f7ee2SSatish Balay   PetscPointCloud<PetscReal> *ptcloud;
179*f17f7ee2SSatish Balay 
180*f17f7ee2SSatish Balay   /* kernel for generating matrix entries */
181*f17f7ee2SSatish Balay   PetscFunctionGenerator<PetscScalar> *kernel;
182*f17f7ee2SSatish Balay 
183*f17f7ee2SSatish Balay   /* basis orthogonalized? */
184*f17f7ee2SSatish Balay   PetscBool orthogonal;
185*f17f7ee2SSatish Balay 
186*f17f7ee2SSatish Balay   /* customization */
187*f17f7ee2SSatish Balay   PetscInt  basisord;
188*f17f7ee2SSatish Balay   PetscInt  max_rank;
189*f17f7ee2SSatish Balay   PetscInt  bs;
190*f17f7ee2SSatish Balay   PetscReal rtol;
191*f17f7ee2SSatish Balay   PetscInt  norm_max_samples;
192*f17f7ee2SSatish Balay   PetscBool check_construction;
193*f17f7ee2SSatish Balay   PetscBool hara_verbose;
194*f17f7ee2SSatish Balay 
195*f17f7ee2SSatish Balay   /* keeps track of MatScale values */
196*f17f7ee2SSatish Balay   PetscScalar s;
197*f17f7ee2SSatish Balay } Mat_H2OPUS;
198*f17f7ee2SSatish Balay 
199*f17f7ee2SSatish Balay static PetscErrorCode MatDestroy_H2OPUS(Mat A)
200*f17f7ee2SSatish Balay {
201*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
202*f17f7ee2SSatish Balay 
203*f17f7ee2SSatish Balay   PetscFunctionBegin;
204*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
205*f17f7ee2SSatish Balay   h2opusDestroyDistributedHandle(a->handle);
206*f17f7ee2SSatish Balay #else
207*f17f7ee2SSatish Balay   h2opusDestroyHandle(a->handle);
208*f17f7ee2SSatish Balay #endif
209*f17f7ee2SSatish Balay   delete a->dist_hmatrix;
210*f17f7ee2SSatish Balay   delete a->hmatrix;
211*f17f7ee2SSatish Balay   PetscCall(PetscSFDestroy(&a->sf));
212*f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
213*f17f7ee2SSatish Balay   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
214*f17f7ee2SSatish Balay   PetscCall(ISDestroy(&a->h2opus_indexmap));
215*f17f7ee2SSatish Balay   delete a->xx;
216*f17f7ee2SSatish Balay   delete a->yy;
217*f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
218*f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
219*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
220*f17f7ee2SSatish Balay   delete a->xx_gpu;
221*f17f7ee2SSatish Balay   delete a->yy_gpu;
222*f17f7ee2SSatish Balay #endif
223*f17f7ee2SSatish Balay   delete a->sampler;
224*f17f7ee2SSatish Balay   delete a->ptcloud;
225*f17f7ee2SSatish Balay   delete a->kernel;
226*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
227*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
228*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
229*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
230*f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
231*f17f7ee2SSatish Balay   PetscCall(PetscFree(A->data));
232*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
233*f17f7ee2SSatish Balay }
234*f17f7ee2SSatish Balay 
235*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
236*f17f7ee2SSatish Balay {
237*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
238*f17f7ee2SSatish Balay   PetscBool      ish2opus;
239*f17f7ee2SSatish Balay 
240*f17f7ee2SSatish Balay   PetscFunctionBegin;
241*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
242*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A,nm,2);
243*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
244*f17f7ee2SSatish Balay   if (ish2opus) {
245*f17f7ee2SSatish Balay     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
246*f17f7ee2SSatish Balay       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
247*f17f7ee2SSatish Balay         PetscLayout t;
248*f17f7ee2SSatish Balay         t = A->rmap;
249*f17f7ee2SSatish Balay         A->rmap = a->h2opus_rmap;
250*f17f7ee2SSatish Balay         a->h2opus_rmap = t;
251*f17f7ee2SSatish Balay         t = A->cmap;
252*f17f7ee2SSatish Balay         A->cmap = a->h2opus_cmap;
253*f17f7ee2SSatish Balay         a->h2opus_cmap = t;
254*f17f7ee2SSatish Balay       }
255*f17f7ee2SSatish Balay     }
256*f17f7ee2SSatish Balay     a->nativemult = nm;
257*f17f7ee2SSatish Balay   }
258*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
259*f17f7ee2SSatish Balay }
260*f17f7ee2SSatish Balay 
261*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
262*f17f7ee2SSatish Balay {
263*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
264*f17f7ee2SSatish Balay   PetscBool      ish2opus;
265*f17f7ee2SSatish Balay 
266*f17f7ee2SSatish Balay   PetscFunctionBegin;
267*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
268*f17f7ee2SSatish Balay   PetscValidPointer(nm,2);
269*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
270*f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
271*f17f7ee2SSatish Balay   *nm = a->nativemult;
272*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
273*f17f7ee2SSatish Balay }
274*f17f7ee2SSatish Balay 
275*f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
276*f17f7ee2SSatish Balay {
277*f17f7ee2SSatish Balay   PetscBool      ish2opus;
278*f17f7ee2SSatish Balay   PetscInt       nmax = PETSC_DECIDE;
279*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = NULL;
280*f17f7ee2SSatish Balay   PetscBool      mult = PETSC_FALSE;
281*f17f7ee2SSatish Balay 
282*f17f7ee2SSatish Balay   PetscFunctionBegin;
283*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
284*f17f7ee2SSatish Balay   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
285*f17f7ee2SSatish Balay     a = (Mat_H2OPUS*)A->data;
286*f17f7ee2SSatish Balay 
287*f17f7ee2SSatish Balay     nmax = a->norm_max_samples;
288*f17f7ee2SSatish Balay     mult = a->nativemult;
289*f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
290*f17f7ee2SSatish Balay   } else {
291*f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
292*f17f7ee2SSatish Balay   }
293*f17f7ee2SSatish Balay   PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
294*f17f7ee2SSatish Balay   if (a) PetscCall(MatH2OpusSetNativeMult(A,mult));
295*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
296*f17f7ee2SSatish Balay }
297*f17f7ee2SSatish Balay 
298*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
299*f17f7ee2SSatish Balay {
300*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
301*f17f7ee2SSatish Balay   PetscInt       n;
302*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
303*f17f7ee2SSatish Balay 
304*f17f7ee2SSatish Balay   PetscFunctionBegin;
305*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
306*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
307*f17f7ee2SSatish Balay #endif
308*f17f7ee2SSatish Balay   PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
309*f17f7ee2SSatish Balay   if (boundtocpu) {
310*f17f7ee2SSatish Balay     if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
311*f17f7ee2SSatish Balay     if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
312*f17f7ee2SSatish Balay   }
313*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
314*f17f7ee2SSatish Balay   if (!boundtocpu) {
315*f17f7ee2SSatish Balay     if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
316*f17f7ee2SSatish Balay     if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
317*f17f7ee2SSatish Balay   }
318*f17f7ee2SSatish Balay #endif
319*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
320*f17f7ee2SSatish Balay }
321*f17f7ee2SSatish Balay 
322*f17f7ee2SSatish Balay static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
323*f17f7ee2SSatish Balay {
324*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
325*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
326*f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
327*f17f7ee2SSatish Balay #else
328*f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
329*f17f7ee2SSatish Balay #endif
330*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
331*f17f7ee2SSatish Balay   PetscScalar    *xx,*yy,*uxx,*uyy;
332*f17f7ee2SSatish Balay   PetscInt       blda,clda;
333*f17f7ee2SSatish Balay   PetscMPIInt    size;
334*f17f7ee2SSatish Balay   PetscSF        bsf,csf;
335*f17f7ee2SSatish Balay   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
336*f17f7ee2SSatish Balay 
337*f17f7ee2SSatish Balay   PetscFunctionBegin;
338*f17f7ee2SSatish Balay   HLibProfile::clear();
339*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
340*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
341*f17f7ee2SSatish Balay #endif
342*f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(B,&blda));
343*f17f7ee2SSatish Balay   PetscCall(MatDenseGetLDA(C,&clda));
344*f17f7ee2SSatish Balay   if (usesf) {
345*f17f7ee2SSatish Balay     PetscInt n;
346*f17f7ee2SSatish Balay 
347*f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
348*f17f7ee2SSatish Balay     PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));
349*f17f7ee2SSatish Balay 
350*f17f7ee2SSatish Balay     PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
351*f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
352*f17f7ee2SSatish Balay     blda = n;
353*f17f7ee2SSatish Balay     clda = n;
354*f17f7ee2SSatish Balay   }
355*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
356*f17f7ee2SSatish Balay   if (boundtocpu) {
357*f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
358*f17f7ee2SSatish Balay     PetscCall(MatDenseGetArrayWrite(C,&yy));
359*f17f7ee2SSatish Balay     if (usesf) {
360*f17f7ee2SSatish Balay       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
361*f17f7ee2SSatish Balay       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
362*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
363*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
364*f17f7ee2SSatish Balay     } else {
365*f17f7ee2SSatish Balay       uxx = xx;
366*f17f7ee2SSatish Balay       uyy = yy;
367*f17f7ee2SSatish Balay     }
368*f17f7ee2SSatish Balay     if (size > 1) {
369*f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
370*f17f7ee2SSatish Balay       PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
371*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
372*f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
373*f17f7ee2SSatish Balay #endif
374*f17f7ee2SSatish Balay     } else {
375*f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
376*f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
377*f17f7ee2SSatish Balay     }
378*f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
379*f17f7ee2SSatish Balay     if (usesf) {
380*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
381*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
382*f17f7ee2SSatish Balay     }
383*f17f7ee2SSatish Balay     PetscCall(MatDenseRestoreArrayWrite(C,&yy));
384*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
385*f17f7ee2SSatish Balay   } else {
386*f17f7ee2SSatish Balay     PetscBool ciscuda,biscuda;
387*f17f7ee2SSatish Balay 
388*f17f7ee2SSatish Balay     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
389*f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
390*f17f7ee2SSatish Balay     if (!biscuda) {
391*f17f7ee2SSatish Balay       PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
392*f17f7ee2SSatish Balay     }
393*f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
394*f17f7ee2SSatish Balay     if (!ciscuda) {
395*f17f7ee2SSatish Balay       C->assembled = PETSC_TRUE;
396*f17f7ee2SSatish Balay       PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
397*f17f7ee2SSatish Balay     }
398*f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
399*f17f7ee2SSatish Balay     PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
400*f17f7ee2SSatish Balay     if (usesf) {
401*f17f7ee2SSatish Balay       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
402*f17f7ee2SSatish Balay       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
403*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
404*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
405*f17f7ee2SSatish Balay     } else {
406*f17f7ee2SSatish Balay       uxx = xx;
407*f17f7ee2SSatish Balay       uyy = yy;
408*f17f7ee2SSatish Balay     }
409*f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
410*f17f7ee2SSatish Balay     if (size > 1) {
411*f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
412*f17f7ee2SSatish Balay       PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
413*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
414*f17f7ee2SSatish Balay       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
415*f17f7ee2SSatish Balay #endif
416*f17f7ee2SSatish Balay     } else {
417*f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
418*f17f7ee2SSatish Balay       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
419*f17f7ee2SSatish Balay     }
420*f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
421*f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
422*f17f7ee2SSatish Balay     if (usesf) {
423*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
424*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
425*f17f7ee2SSatish Balay     }
426*f17f7ee2SSatish Balay     PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
427*f17f7ee2SSatish Balay     if (!biscuda) {
428*f17f7ee2SSatish Balay       PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
429*f17f7ee2SSatish Balay     }
430*f17f7ee2SSatish Balay     if (!ciscuda) {
431*f17f7ee2SSatish Balay       PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
432*f17f7ee2SSatish Balay     }
433*f17f7ee2SSatish Balay #endif
434*f17f7ee2SSatish Balay   }
435*f17f7ee2SSatish Balay   { /* log flops */
436*f17f7ee2SSatish Balay     double gops,time,perf,dev;
437*f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops,time,perf,dev);
438*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
439*f17f7ee2SSatish Balay     if (boundtocpu) {
440*f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
441*f17f7ee2SSatish Balay     } else {
442*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
443*f17f7ee2SSatish Balay     }
444*f17f7ee2SSatish Balay #else
445*f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
446*f17f7ee2SSatish Balay #endif
447*f17f7ee2SSatish Balay   }
448*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
449*f17f7ee2SSatish Balay }
450*f17f7ee2SSatish Balay 
451*f17f7ee2SSatish Balay static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
452*f17f7ee2SSatish Balay {
453*f17f7ee2SSatish Balay   Mat_Product    *product = C->product;
454*f17f7ee2SSatish Balay 
455*f17f7ee2SSatish Balay   PetscFunctionBegin;
456*f17f7ee2SSatish Balay   MatCheckProduct(C,1);
457*f17f7ee2SSatish Balay   switch (product->type) {
458*f17f7ee2SSatish Balay   case MATPRODUCT_AB:
459*f17f7ee2SSatish Balay     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
460*f17f7ee2SSatish Balay     break;
461*f17f7ee2SSatish Balay   case MATPRODUCT_AtB:
462*f17f7ee2SSatish Balay     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
463*f17f7ee2SSatish Balay     break;
464*f17f7ee2SSatish Balay   default:
465*f17f7ee2SSatish Balay     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
466*f17f7ee2SSatish Balay   }
467*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
468*f17f7ee2SSatish Balay }
469*f17f7ee2SSatish Balay 
470*f17f7ee2SSatish Balay static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
471*f17f7ee2SSatish Balay {
472*f17f7ee2SSatish Balay   Mat_Product    *product = C->product;
473*f17f7ee2SSatish Balay   PetscBool      cisdense;
474*f17f7ee2SSatish Balay   Mat            A,B;
475*f17f7ee2SSatish Balay 
476*f17f7ee2SSatish Balay   PetscFunctionBegin;
477*f17f7ee2SSatish Balay   MatCheckProduct(C,1);
478*f17f7ee2SSatish Balay   A = product->A;
479*f17f7ee2SSatish Balay   B = product->B;
480*f17f7ee2SSatish Balay   switch (product->type) {
481*f17f7ee2SSatish Balay   case MATPRODUCT_AB:
482*f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
483*f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
484*f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
485*f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
486*f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
487*f17f7ee2SSatish Balay     break;
488*f17f7ee2SSatish Balay   case MATPRODUCT_AtB:
489*f17f7ee2SSatish Balay     PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
490*f17f7ee2SSatish Balay     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
491*f17f7ee2SSatish Balay     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
492*f17f7ee2SSatish Balay     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
493*f17f7ee2SSatish Balay     PetscCall(MatSetUp(C));
494*f17f7ee2SSatish Balay     break;
495*f17f7ee2SSatish Balay   default:
496*f17f7ee2SSatish Balay     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
497*f17f7ee2SSatish Balay   }
498*f17f7ee2SSatish Balay   C->ops->productsymbolic = NULL;
499*f17f7ee2SSatish Balay   C->ops->productnumeric = MatProductNumeric_H2OPUS;
500*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
501*f17f7ee2SSatish Balay }
502*f17f7ee2SSatish Balay 
503*f17f7ee2SSatish Balay static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
504*f17f7ee2SSatish Balay {
505*f17f7ee2SSatish Balay   PetscFunctionBegin;
506*f17f7ee2SSatish Balay   MatCheckProduct(C,1);
507*f17f7ee2SSatish Balay   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
508*f17f7ee2SSatish Balay     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
509*f17f7ee2SSatish Balay   }
510*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
511*f17f7ee2SSatish Balay }
512*f17f7ee2SSatish Balay 
513*f17f7ee2SSatish Balay static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
514*f17f7ee2SSatish Balay {
515*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
516*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
517*f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle->handle;
518*f17f7ee2SSatish Balay #else
519*f17f7ee2SSatish Balay   h2opusHandle_t handle = h2opus->handle;
520*f17f7ee2SSatish Balay #endif
521*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
522*f17f7ee2SSatish Balay   PetscInt       n;
523*f17f7ee2SSatish Balay   PetscScalar    *xx,*yy,*uxx,*uyy;
524*f17f7ee2SSatish Balay   PetscMPIInt    size;
525*f17f7ee2SSatish Balay   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
526*f17f7ee2SSatish Balay 
527*f17f7ee2SSatish Balay   PetscFunctionBegin;
528*f17f7ee2SSatish Balay   HLibProfile::clear();
529*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
530*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
531*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
532*f17f7ee2SSatish Balay #endif
533*f17f7ee2SSatish Balay   if (usesf) {
534*f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
535*f17f7ee2SSatish Balay   } else n = A->rmap->n;
536*f17f7ee2SSatish Balay   if (boundtocpu) {
537*f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
538*f17f7ee2SSatish Balay     if (sy == 0.0) {
539*f17f7ee2SSatish Balay       PetscCall(VecGetArrayWrite(y,&yy));
540*f17f7ee2SSatish Balay     } else {
541*f17f7ee2SSatish Balay       PetscCall(VecGetArray(y,&yy));
542*f17f7ee2SSatish Balay     }
543*f17f7ee2SSatish Balay     if (usesf) {
544*f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
545*f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
546*f17f7ee2SSatish Balay 
547*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
548*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
549*f17f7ee2SSatish Balay       if (sy != 0.0) {
550*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
551*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
552*f17f7ee2SSatish Balay       }
553*f17f7ee2SSatish Balay     } else {
554*f17f7ee2SSatish Balay       uxx = xx;
555*f17f7ee2SSatish Balay       uyy = yy;
556*f17f7ee2SSatish Balay     }
557*f17f7ee2SSatish Balay     if (size > 1) {
558*f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
559*f17f7ee2SSatish Balay       PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
560*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
561*f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
562*f17f7ee2SSatish Balay #endif
563*f17f7ee2SSatish Balay     } else {
564*f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
565*f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
566*f17f7ee2SSatish Balay     }
567*f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
568*f17f7ee2SSatish Balay     if (usesf) {
569*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
570*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
571*f17f7ee2SSatish Balay     }
572*f17f7ee2SSatish Balay     if (sy == 0.0) {
573*f17f7ee2SSatish Balay       PetscCall(VecRestoreArrayWrite(y,&yy));
574*f17f7ee2SSatish Balay     } else {
575*f17f7ee2SSatish Balay       PetscCall(VecRestoreArray(y,&yy));
576*f17f7ee2SSatish Balay     }
577*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
578*f17f7ee2SSatish Balay   } else {
579*f17f7ee2SSatish Balay     PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
580*f17f7ee2SSatish Balay     if (sy == 0.0) {
581*f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArrayWrite(y,&yy));
582*f17f7ee2SSatish Balay     } else {
583*f17f7ee2SSatish Balay       PetscCall(VecCUDAGetArray(y,&yy));
584*f17f7ee2SSatish Balay     }
585*f17f7ee2SSatish Balay     if (usesf) {
586*f17f7ee2SSatish Balay       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
587*f17f7ee2SSatish Balay       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
588*f17f7ee2SSatish Balay 
589*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
590*f17f7ee2SSatish Balay       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
591*f17f7ee2SSatish Balay       if (sy != 0.0) {
592*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
593*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
594*f17f7ee2SSatish Balay       }
595*f17f7ee2SSatish Balay     } else {
596*f17f7ee2SSatish Balay       uxx = xx;
597*f17f7ee2SSatish Balay       uyy = yy;
598*f17f7ee2SSatish Balay     }
599*f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeBegin());
600*f17f7ee2SSatish Balay     if (size > 1) {
601*f17f7ee2SSatish Balay       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
602*f17f7ee2SSatish Balay       PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
603*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
604*f17f7ee2SSatish Balay       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
605*f17f7ee2SSatish Balay #endif
606*f17f7ee2SSatish Balay     } else {
607*f17f7ee2SSatish Balay       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
608*f17f7ee2SSatish Balay       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
609*f17f7ee2SSatish Balay     }
610*f17f7ee2SSatish Balay     PetscCall(PetscLogGpuTimeEnd());
611*f17f7ee2SSatish Balay     PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
612*f17f7ee2SSatish Balay     if (usesf) {
613*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
614*f17f7ee2SSatish Balay       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
615*f17f7ee2SSatish Balay     }
616*f17f7ee2SSatish Balay     if (sy == 0.0) {
617*f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArrayWrite(y,&yy));
618*f17f7ee2SSatish Balay     } else {
619*f17f7ee2SSatish Balay       PetscCall(VecCUDARestoreArray(y,&yy));
620*f17f7ee2SSatish Balay     }
621*f17f7ee2SSatish Balay #endif
622*f17f7ee2SSatish Balay   }
623*f17f7ee2SSatish Balay   { /* log flops */
624*f17f7ee2SSatish Balay     double gops,time,perf,dev;
625*f17f7ee2SSatish Balay     HLibProfile::getHgemvPerf(gops,time,perf,dev);
626*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
627*f17f7ee2SSatish Balay     if (boundtocpu) {
628*f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
629*f17f7ee2SSatish Balay     } else {
630*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
631*f17f7ee2SSatish Balay     }
632*f17f7ee2SSatish Balay #else
633*f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
634*f17f7ee2SSatish Balay #endif
635*f17f7ee2SSatish Balay   }
636*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
637*f17f7ee2SSatish Balay }
638*f17f7ee2SSatish Balay 
639*f17f7ee2SSatish Balay static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
640*f17f7ee2SSatish Balay {
641*f17f7ee2SSatish Balay   PetscBool      xiscuda,yiscuda;
642*f17f7ee2SSatish Balay 
643*f17f7ee2SSatish Balay   PetscFunctionBegin;
644*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
645*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
646*f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
647*f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
648*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
649*f17f7ee2SSatish Balay }
650*f17f7ee2SSatish Balay 
651*f17f7ee2SSatish Balay static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
652*f17f7ee2SSatish Balay {
653*f17f7ee2SSatish Balay   PetscBool      xiscuda,yiscuda;
654*f17f7ee2SSatish Balay 
655*f17f7ee2SSatish Balay   PetscFunctionBegin;
656*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
657*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
658*f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
659*f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
660*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
661*f17f7ee2SSatish Balay }
662*f17f7ee2SSatish Balay 
663*f17f7ee2SSatish Balay static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
664*f17f7ee2SSatish Balay {
665*f17f7ee2SSatish Balay   PetscBool      xiscuda,ziscuda;
666*f17f7ee2SSatish Balay 
667*f17f7ee2SSatish Balay   PetscFunctionBegin;
668*f17f7ee2SSatish Balay   PetscCall(VecCopy(y,z));
669*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
670*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
671*f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
672*f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
673*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
674*f17f7ee2SSatish Balay }
675*f17f7ee2SSatish Balay 
676*f17f7ee2SSatish Balay static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
677*f17f7ee2SSatish Balay {
678*f17f7ee2SSatish Balay   PetscBool      xiscuda,ziscuda;
679*f17f7ee2SSatish Balay 
680*f17f7ee2SSatish Balay   PetscFunctionBegin;
681*f17f7ee2SSatish Balay   PetscCall(VecCopy(y,z));
682*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
683*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
684*f17f7ee2SSatish Balay   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
685*f17f7ee2SSatish Balay   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
686*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
687*f17f7ee2SSatish Balay }
688*f17f7ee2SSatish Balay 
689*f17f7ee2SSatish Balay static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
690*f17f7ee2SSatish Balay {
691*f17f7ee2SSatish Balay   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
692*f17f7ee2SSatish Balay 
693*f17f7ee2SSatish Balay   PetscFunctionBegin;
694*f17f7ee2SSatish Balay   a->s *= s;
695*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
696*f17f7ee2SSatish Balay }
697*f17f7ee2SSatish Balay 
698*f17f7ee2SSatish Balay static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
699*f17f7ee2SSatish Balay {
700*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
701*f17f7ee2SSatish Balay 
702*f17f7ee2SSatish Balay   PetscFunctionBegin;
703*f17f7ee2SSatish Balay   PetscCall(PetscOptionsHead(PetscOptionsObject,"H2OPUS options"));
704*f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
705*f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
706*f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
707*f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
708*f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
709*f17f7ee2SSatish Balay   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
710*f17f7ee2SSatish Balay   PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
711*f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
712*f17f7ee2SSatish Balay   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
713*f17f7ee2SSatish Balay   PetscCall(PetscOptionsTail());
714*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
715*f17f7ee2SSatish Balay }
716*f17f7ee2SSatish Balay 
717*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
718*f17f7ee2SSatish Balay 
719*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
720*f17f7ee2SSatish Balay {
721*f17f7ee2SSatish Balay   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
722*f17f7ee2SSatish Balay   Vec               c;
723*f17f7ee2SSatish Balay   PetscInt          spacedim;
724*f17f7ee2SSatish Balay   const PetscScalar *coords;
725*f17f7ee2SSatish Balay 
726*f17f7ee2SSatish Balay   PetscFunctionBegin;
727*f17f7ee2SSatish Balay   if (a->ptcloud) PetscFunctionReturn(0);
728*f17f7ee2SSatish Balay   PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
729*f17f7ee2SSatish Balay   if (!c && a->sampler) {
730*f17f7ee2SSatish Balay     Mat S = a->sampler->GetSamplingMat();
731*f17f7ee2SSatish Balay 
732*f17f7ee2SSatish Balay     PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
733*f17f7ee2SSatish Balay   }
734*f17f7ee2SSatish Balay   if (!c) {
735*f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
736*f17f7ee2SSatish Balay   } else {
737*f17f7ee2SSatish Balay     PetscCall(VecGetArrayRead(c,&coords));
738*f17f7ee2SSatish Balay     PetscCall(VecGetBlockSize(c,&spacedim));
739*f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
740*f17f7ee2SSatish Balay     PetscCall(VecRestoreArrayRead(c,&coords));
741*f17f7ee2SSatish Balay   }
742*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
743*f17f7ee2SSatish Balay }
744*f17f7ee2SSatish Balay 
745*f17f7ee2SSatish Balay static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
746*f17f7ee2SSatish Balay {
747*f17f7ee2SSatish Balay   MPI_Comm       comm;
748*f17f7ee2SSatish Balay   PetscMPIInt    size;
749*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
750*f17f7ee2SSatish Balay   PetscInt       n = 0,*idx = NULL;
751*f17f7ee2SSatish Balay   int            *iidx = NULL;
752*f17f7ee2SSatish Balay   PetscCopyMode  own;
753*f17f7ee2SSatish Balay   PetscBool      rid;
754*f17f7ee2SSatish Balay 
755*f17f7ee2SSatish Balay   PetscFunctionBegin;
756*f17f7ee2SSatish Balay   if (a->multsetup) PetscFunctionReturn(0);
757*f17f7ee2SSatish Balay   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
758*f17f7ee2SSatish Balay     PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
759*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
760*f17f7ee2SSatish Balay     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
761*f17f7ee2SSatish Balay     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
762*f17f7ee2SSatish Balay     a->xxs_gpu = 1;
763*f17f7ee2SSatish Balay     a->yys_gpu = 1;
764*f17f7ee2SSatish Balay #endif
765*f17f7ee2SSatish Balay     a->xx  = new thrust::host_vector<PetscScalar>(n);
766*f17f7ee2SSatish Balay     a->yy  = new thrust::host_vector<PetscScalar>(n);
767*f17f7ee2SSatish Balay     a->xxs = 1;
768*f17f7ee2SSatish Balay     a->yys = 1;
769*f17f7ee2SSatish Balay   } else {
770*f17f7ee2SSatish Balay     IS is;
771*f17f7ee2SSatish Balay     PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
772*f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(comm,&size));
773*f17f7ee2SSatish Balay     if (!a->h2opus_indexmap) {
774*f17f7ee2SSatish Balay       if (size > 1) {
775*f17f7ee2SSatish Balay         PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
776*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
777*f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
778*f17f7ee2SSatish Balay         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
779*f17f7ee2SSatish Balay #endif
780*f17f7ee2SSatish Balay       } else {
781*f17f7ee2SSatish Balay         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
782*f17f7ee2SSatish Balay         n    = a->hmatrix->u_basis_tree.index_map.size();
783*f17f7ee2SSatish Balay       }
784*f17f7ee2SSatish Balay 
785*f17f7ee2SSatish Balay       if (PetscDefined(USE_64BIT_INDICES)) {
786*f17f7ee2SSatish Balay         PetscInt i;
787*f17f7ee2SSatish Balay 
788*f17f7ee2SSatish Balay         own  = PETSC_OWN_POINTER;
789*f17f7ee2SSatish Balay         PetscCall(PetscMalloc1(n,&idx));
790*f17f7ee2SSatish Balay         for (i=0;i<n;i++) idx[i] = iidx[i];
791*f17f7ee2SSatish Balay       } else {
792*f17f7ee2SSatish Balay         own  = PETSC_COPY_VALUES;
793*f17f7ee2SSatish Balay         idx  = (PetscInt*)iidx;
794*f17f7ee2SSatish Balay       }
795*f17f7ee2SSatish Balay       PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
796*f17f7ee2SSatish Balay       PetscCall(ISSetPermutation(is));
797*f17f7ee2SSatish Balay       PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
798*f17f7ee2SSatish Balay       a->h2opus_indexmap = is;
799*f17f7ee2SSatish Balay     }
800*f17f7ee2SSatish Balay     PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
801*f17f7ee2SSatish Balay     PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
802*f17f7ee2SSatish Balay     rid  = (PetscBool)(n == A->rmap->n);
803*f17f7ee2SSatish Balay     PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
804*f17f7ee2SSatish Balay     if (rid) {
805*f17f7ee2SSatish Balay       PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
806*f17f7ee2SSatish Balay     }
807*f17f7ee2SSatish Balay     if (!rid) {
808*f17f7ee2SSatish Balay       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
809*f17f7ee2SSatish Balay         PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
810*f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
811*f17f7ee2SSatish Balay         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
812*f17f7ee2SSatish Balay         PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
813*f17f7ee2SSatish Balay       }
814*f17f7ee2SSatish Balay       PetscCall(PetscSFCreate(comm,&a->sf));
815*f17f7ee2SSatish Balay       PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
816*f17f7ee2SSatish Balay       PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
817*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
818*f17f7ee2SSatish Balay       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
819*f17f7ee2SSatish Balay       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
820*f17f7ee2SSatish Balay       a->xxs_gpu = 1;
821*f17f7ee2SSatish Balay       a->yys_gpu = 1;
822*f17f7ee2SSatish Balay #endif
823*f17f7ee2SSatish Balay       a->xx  = new thrust::host_vector<PetscScalar>(n);
824*f17f7ee2SSatish Balay       a->yy  = new thrust::host_vector<PetscScalar>(n);
825*f17f7ee2SSatish Balay       a->xxs = 1;
826*f17f7ee2SSatish Balay       a->yys = 1;
827*f17f7ee2SSatish Balay     }
828*f17f7ee2SSatish Balay     PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
829*f17f7ee2SSatish Balay   }
830*f17f7ee2SSatish Balay   a->multsetup = PETSC_TRUE;
831*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
832*f17f7ee2SSatish Balay }
833*f17f7ee2SSatish Balay 
834*f17f7ee2SSatish Balay static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
835*f17f7ee2SSatish Balay {
836*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
837*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
838*f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle->handle;
839*f17f7ee2SSatish Balay #else
840*f17f7ee2SSatish Balay   h2opusHandle_t handle = a->handle;
841*f17f7ee2SSatish Balay #endif
842*f17f7ee2SSatish Balay   PetscBool      kernel = PETSC_FALSE;
843*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
844*f17f7ee2SSatish Balay   PetscBool      samplingdone = PETSC_FALSE;
845*f17f7ee2SSatish Balay   MPI_Comm       comm;
846*f17f7ee2SSatish Balay   PetscMPIInt    size;
847*f17f7ee2SSatish Balay 
848*f17f7ee2SSatish Balay   PetscFunctionBegin;
849*f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
850*f17f7ee2SSatish Balay   PetscCheckFalse(A->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
851*f17f7ee2SSatish Balay   PetscCheckFalse(A->rmap->N != A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
852*f17f7ee2SSatish Balay 
853*f17f7ee2SSatish Balay   /* XXX */
854*f17f7ee2SSatish Balay   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
855*f17f7ee2SSatish Balay 
856*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm,&size));
857*f17f7ee2SSatish Balay   /* TODO REUSABILITY of geometric construction */
858*f17f7ee2SSatish Balay   delete a->hmatrix;
859*f17f7ee2SSatish Balay   delete a->dist_hmatrix;
860*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
861*f17f7ee2SSatish Balay   delete a->hmatrix_gpu;
862*f17f7ee2SSatish Balay   delete a->dist_hmatrix_gpu;
863*f17f7ee2SSatish Balay #endif
864*f17f7ee2SSatish Balay   a->orthogonal = PETSC_FALSE;
865*f17f7ee2SSatish Balay 
866*f17f7ee2SSatish Balay   /* TODO: other? */
867*f17f7ee2SSatish Balay   H2OpusBoxCenterAdmissibility adm(a->eta);
868*f17f7ee2SSatish Balay 
869*f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
870*f17f7ee2SSatish Balay   if (size > 1) {
871*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
872*f17f7ee2SSatish Balay     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
873*f17f7ee2SSatish Balay #else
874*f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
875*f17f7ee2SSatish Balay #endif
876*f17f7ee2SSatish Balay   } else {
877*f17f7ee2SSatish Balay     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
878*f17f7ee2SSatish Balay   }
879*f17f7ee2SSatish Balay   PetscCall(MatH2OpusInferCoordinates_Private(A));
880*f17f7ee2SSatish Balay   PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
881*f17f7ee2SSatish Balay   if (a->kernel) {
882*f17f7ee2SSatish Balay     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
883*f17f7ee2SSatish Balay     if (size > 1) {
884*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
885*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
886*f17f7ee2SSatish Balay       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
887*f17f7ee2SSatish Balay #endif
888*f17f7ee2SSatish Balay     } else {
889*f17f7ee2SSatish Balay       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
890*f17f7ee2SSatish Balay     }
891*f17f7ee2SSatish Balay     kernel = PETSC_TRUE;
892*f17f7ee2SSatish Balay   } else {
893*f17f7ee2SSatish Balay     PetscCheckFalse(size > 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
894*f17f7ee2SSatish Balay     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
895*f17f7ee2SSatish Balay   }
896*f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
897*f17f7ee2SSatish Balay 
898*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
899*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
900*f17f7ee2SSatish Balay   if (!boundtocpu) {
901*f17f7ee2SSatish Balay     if (size > 1) {
902*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
903*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
904*f17f7ee2SSatish Balay       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
905*f17f7ee2SSatish Balay #endif
906*f17f7ee2SSatish Balay     } else {
907*f17f7ee2SSatish Balay       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
908*f17f7ee2SSatish Balay     }
909*f17f7ee2SSatish Balay   }
910*f17f7ee2SSatish Balay #endif
911*f17f7ee2SSatish Balay   if (size == 1) {
912*f17f7ee2SSatish Balay     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
913*f17f7ee2SSatish Balay       PetscReal Anorm;
914*f17f7ee2SSatish Balay       bool      verbose;
915*f17f7ee2SSatish Balay 
916*f17f7ee2SSatish Balay       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
917*f17f7ee2SSatish Balay       verbose = a->hara_verbose;
918*f17f7ee2SSatish Balay       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
919*f17f7ee2SSatish Balay       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
920*f17f7ee2SSatish Balay       if (a->sf && !a->nativemult) {
921*f17f7ee2SSatish Balay         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
922*f17f7ee2SSatish Balay       }
923*f17f7ee2SSatish Balay       a->sampler->SetStream(handle->getMainStream());
924*f17f7ee2SSatish Balay       if (boundtocpu) {
925*f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(false);
926*f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
927*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
928*f17f7ee2SSatish Balay       } else {
929*f17f7ee2SSatish Balay         a->sampler->SetGPUSampling(true);
930*f17f7ee2SSatish Balay         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
931*f17f7ee2SSatish Balay #endif
932*f17f7ee2SSatish Balay       }
933*f17f7ee2SSatish Balay       samplingdone = PETSC_TRUE;
934*f17f7ee2SSatish Balay     }
935*f17f7ee2SSatish Balay   }
936*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
937*f17f7ee2SSatish Balay   if (!boundtocpu) {
938*f17f7ee2SSatish Balay     delete a->hmatrix;
939*f17f7ee2SSatish Balay     delete a->dist_hmatrix;
940*f17f7ee2SSatish Balay     a->hmatrix = NULL;
941*f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
942*f17f7ee2SSatish Balay   }
943*f17f7ee2SSatish Balay   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
944*f17f7ee2SSatish Balay #endif
945*f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));
946*f17f7ee2SSatish Balay 
947*f17f7ee2SSatish Balay   if (!a->s) a->s = 1.0;
948*f17f7ee2SSatish Balay   A->assembled = PETSC_TRUE;
949*f17f7ee2SSatish Balay 
950*f17f7ee2SSatish Balay   if (samplingdone) {
951*f17f7ee2SSatish Balay     PetscBool check = a->check_construction;
952*f17f7ee2SSatish Balay     PetscBool checke = PETSC_FALSE;
953*f17f7ee2SSatish Balay 
954*f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
955*f17f7ee2SSatish Balay     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
956*f17f7ee2SSatish Balay     if (check) {
957*f17f7ee2SSatish Balay       Mat       E,Ae;
958*f17f7ee2SSatish Balay       PetscReal n1,ni,n2;
959*f17f7ee2SSatish Balay       PetscReal n1A,niA,n2A;
960*f17f7ee2SSatish Balay       void      (*normfunc)(void);
961*f17f7ee2SSatish Balay 
962*f17f7ee2SSatish Balay       Ae   = a->sampler->GetSamplingMat();
963*f17f7ee2SSatish Balay       PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
964*f17f7ee2SSatish Balay       PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
965*f17f7ee2SSatish Balay       PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
966*f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_1,&n1));
967*f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_INFINITY,&ni));
968*f17f7ee2SSatish Balay       PetscCall(MatNorm(E,NORM_2,&n2));
969*f17f7ee2SSatish Balay       if (checke) {
970*f17f7ee2SSatish Balay         Mat eA,eE,eAe;
971*f17f7ee2SSatish Balay 
972*f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(A,MATAIJ,&eA));
973*f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(E,MATAIJ,&eE));
974*f17f7ee2SSatish Balay         PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
975*f17f7ee2SSatish Balay         PetscCall(MatChop(eA,PETSC_SMALL));
976*f17f7ee2SSatish Balay         PetscCall(MatChop(eE,PETSC_SMALL));
977*f17f7ee2SSatish Balay         PetscCall(MatChop(eAe,PETSC_SMALL));
978*f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
979*f17f7ee2SSatish Balay         PetscCall(MatView(eA,NULL));
980*f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
981*f17f7ee2SSatish Balay         PetscCall(MatView(eAe,NULL));
982*f17f7ee2SSatish Balay         PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
983*f17f7ee2SSatish Balay         PetscCall(MatView(eE,NULL));
984*f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eA));
985*f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eE));
986*f17f7ee2SSatish Balay         PetscCall(MatDestroy(&eAe));
987*f17f7ee2SSatish Balay       }
988*f17f7ee2SSatish Balay 
989*f17f7ee2SSatish Balay       PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
990*f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
991*f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_1,&n1A));
992*f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
993*f17f7ee2SSatish Balay       PetscCall(MatNorm(Ae,NORM_2,&n2A));
994*f17f7ee2SSatish Balay       n1A  = PetscMax(n1A,PETSC_SMALL);
995*f17f7ee2SSatish Balay       n2A  = PetscMax(n2A,PETSC_SMALL);
996*f17f7ee2SSatish Balay       niA  = PetscMax(niA,PETSC_SMALL);
997*f17f7ee2SSatish Balay       PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
998*f17f7ee2SSatish Balay       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
999*f17f7ee2SSatish Balay       PetscCall(MatDestroy(&E));
1000*f17f7ee2SSatish Balay     }
1001*f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(NULL);
1002*f17f7ee2SSatish Balay   }
1003*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1004*f17f7ee2SSatish Balay }
1005*f17f7ee2SSatish Balay 
1006*f17f7ee2SSatish Balay static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007*f17f7ee2SSatish Balay {
1008*f17f7ee2SSatish Balay   PetscMPIInt    size;
1009*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1010*f17f7ee2SSatish Balay 
1011*f17f7ee2SSatish Balay   PetscFunctionBegin;
1012*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1013*f17f7ee2SSatish Balay   PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1014*f17f7ee2SSatish Balay   else {
1015*f17f7ee2SSatish Balay     a->hmatrix->clearData();
1016*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1017*f17f7ee2SSatish Balay     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1018*f17f7ee2SSatish Balay #endif
1019*f17f7ee2SSatish Balay   }
1020*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1021*f17f7ee2SSatish Balay }
1022*f17f7ee2SSatish Balay 
1023*f17f7ee2SSatish Balay static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1024*f17f7ee2SSatish Balay {
1025*f17f7ee2SSatish Balay   Mat            A;
1026*f17f7ee2SSatish Balay   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1027*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1028*f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_FALSE;
1029*f17f7ee2SSatish Balay #else
1030*f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_TRUE;
1031*f17f7ee2SSatish Balay #endif
1032*f17f7ee2SSatish Balay   MPI_Comm       comm;
1033*f17f7ee2SSatish Balay 
1034*f17f7ee2SSatish Balay   PetscFunctionBegin;
1035*f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1036*f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1037*f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1038*f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1039*f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B,A));
1040*f17f7ee2SSatish Balay   a = (Mat_H2OPUS*)A->data;
1041*f17f7ee2SSatish Balay 
1042*f17f7ee2SSatish Balay   a->eta              = b->eta;
1043*f17f7ee2SSatish Balay   a->leafsize         = b->leafsize;
1044*f17f7ee2SSatish Balay   a->basisord         = b->basisord;
1045*f17f7ee2SSatish Balay   a->max_rank         = b->max_rank;
1046*f17f7ee2SSatish Balay   a->bs               = b->bs;
1047*f17f7ee2SSatish Balay   a->rtol             = b->rtol;
1048*f17f7ee2SSatish Balay   a->norm_max_samples = b->norm_max_samples;
1049*f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) a->s = b->s;
1050*f17f7ee2SSatish Balay 
1051*f17f7ee2SSatish Balay   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1052*f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1053*f17f7ee2SSatish Balay 
1054*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1055*f17f7ee2SSatish Balay   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1056*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1057*f17f7ee2SSatish Balay   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1058*f17f7ee2SSatish Balay #endif
1059*f17f7ee2SSatish Balay #endif
1060*f17f7ee2SSatish Balay   if (b->hmatrix) {
1061*f17f7ee2SSatish Balay     a->hmatrix = new HMatrix(*b->hmatrix);
1062*f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1063*f17f7ee2SSatish Balay   }
1064*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1065*f17f7ee2SSatish Balay   if (b->hmatrix_gpu) {
1066*f17f7ee2SSatish Balay     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1067*f17f7ee2SSatish Balay     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1068*f17f7ee2SSatish Balay   }
1069*f17f7ee2SSatish Balay #endif
1070*f17f7ee2SSatish Balay   if (b->sf) {
1071*f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->sf));
1072*f17f7ee2SSatish Balay     a->sf = b->sf;
1073*f17f7ee2SSatish Balay   }
1074*f17f7ee2SSatish Balay   if (b->h2opus_indexmap) {
1075*f17f7ee2SSatish Balay     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1076*f17f7ee2SSatish Balay     a->h2opus_indexmap = b->h2opus_indexmap;
1077*f17f7ee2SSatish Balay   }
1078*f17f7ee2SSatish Balay 
1079*f17f7ee2SSatish Balay   PetscCall(MatSetUp(A));
1080*f17f7ee2SSatish Balay   PetscCall(MatSetUpMultiply_H2OPUS(A));
1081*f17f7ee2SSatish Balay   if (op == MAT_COPY_VALUES) {
1082*f17f7ee2SSatish Balay     A->assembled = PETSC_TRUE;
1083*f17f7ee2SSatish Balay     a->orthogonal = b->orthogonal;
1084*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1085*f17f7ee2SSatish Balay     A->offloadmask = B->offloadmask;
1086*f17f7ee2SSatish Balay #endif
1087*f17f7ee2SSatish Balay   }
1088*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1089*f17f7ee2SSatish Balay   iscpu = B->boundtocpu;
1090*f17f7ee2SSatish Balay #endif
1091*f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,iscpu));
1092*f17f7ee2SSatish Balay 
1093*f17f7ee2SSatish Balay   *nA = A;
1094*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1095*f17f7ee2SSatish Balay }
1096*f17f7ee2SSatish Balay 
1097*f17f7ee2SSatish Balay static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1098*f17f7ee2SSatish Balay {
1099*f17f7ee2SSatish Balay   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1100*f17f7ee2SSatish Balay   PetscBool         isascii;
1101*f17f7ee2SSatish Balay   PetscMPIInt       size;
1102*f17f7ee2SSatish Balay   PetscViewerFormat format;
1103*f17f7ee2SSatish Balay 
1104*f17f7ee2SSatish Balay   PetscFunctionBegin;
1105*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1106*f17f7ee2SSatish Balay   PetscCall(PetscViewerGetFormat(view,&format));
1107*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1108*f17f7ee2SSatish Balay   if (isascii) {
1109*f17f7ee2SSatish Balay     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1110*f17f7ee2SSatish Balay       if (size == 1) {
1111*f17f7ee2SSatish Balay         FILE *fp;
1112*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1113*f17f7ee2SSatish Balay         dumpHMatrix(*h2opus->hmatrix,6,fp);
1114*f17f7ee2SSatish Balay       }
1115*f17f7ee2SSatish Balay     } else {
1116*f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1117*f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1118*f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1119*f17f7ee2SSatish Balay       if (!h2opus->kernel) {
1120*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1121*f17f7ee2SSatish Balay       } else {
1122*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1123*f17f7ee2SSatish Balay       }
1124*f17f7ee2SSatish Balay       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1125*f17f7ee2SSatish Balay       if (size == 1) {
1126*f17f7ee2SSatish Balay         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1127*f17f7ee2SSatish Balay         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1128*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1129*f17f7ee2SSatish Balay         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1130*f17f7ee2SSatish Balay         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1131*f17f7ee2SSatish Balay #endif
1132*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1133*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1134*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1135*f17f7ee2SSatish Balay #endif
1136*f17f7ee2SSatish Balay       } else {
1137*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1138*f17f7ee2SSatish Balay         double      matrix_mem[4] = {0.,0.,0.,0.};
1139*f17f7ee2SSatish Balay         PetscMPIInt rsize = 4;
1140*f17f7ee2SSatish Balay #else
1141*f17f7ee2SSatish Balay         double      matrix_mem[2] = {0.,0.};
1142*f17f7ee2SSatish Balay         PetscMPIInt rsize = 2;
1143*f17f7ee2SSatish Balay #endif
1144*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1145*f17f7ee2SSatish Balay         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1146*f17f7ee2SSatish Balay         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1147*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1148*f17f7ee2SSatish Balay         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1149*f17f7ee2SSatish Balay         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1150*f17f7ee2SSatish Balay #endif
1151*f17f7ee2SSatish Balay #endif
1152*f17f7ee2SSatish Balay         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1153*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1154*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA)
1155*f17f7ee2SSatish Balay         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1156*f17f7ee2SSatish Balay #endif
1157*f17f7ee2SSatish Balay       }
1158*f17f7ee2SSatish Balay     }
1159*f17f7ee2SSatish Balay   }
1160*f17f7ee2SSatish Balay #if 0
1161*f17f7ee2SSatish Balay   if (size == 1) {
1162*f17f7ee2SSatish Balay     char filename[256];
1163*f17f7ee2SSatish Balay     const char *name;
1164*f17f7ee2SSatish Balay 
1165*f17f7ee2SSatish Balay     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1166*f17f7ee2SSatish Balay     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1167*f17f7ee2SSatish Balay     outputEps(*h2opus->hmatrix,filename);
1168*f17f7ee2SSatish Balay   }
1169*f17f7ee2SSatish Balay #endif
1170*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1171*f17f7ee2SSatish Balay }
1172*f17f7ee2SSatish Balay 
1173*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1174*f17f7ee2SSatish Balay {
1175*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1176*f17f7ee2SSatish Balay   PetscReal      *gcoords;
1177*f17f7ee2SSatish Balay   PetscInt       N;
1178*f17f7ee2SSatish Balay   MPI_Comm       comm;
1179*f17f7ee2SSatish Balay   PetscMPIInt    size;
1180*f17f7ee2SSatish Balay   PetscBool      cong;
1181*f17f7ee2SSatish Balay 
1182*f17f7ee2SSatish Balay   PetscFunctionBegin;
1183*f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->rmap));
1184*f17f7ee2SSatish Balay   PetscCall(PetscLayoutSetUp(A->cmap));
1185*f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1186*f17f7ee2SSatish Balay   PetscCall(MatHasCongruentLayouts(A,&cong));
1187*f17f7ee2SSatish Balay   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1188*f17f7ee2SSatish Balay   N    = A->rmap->N;
1189*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(comm,&size));
1190*f17f7ee2SSatish Balay   if (spacedim > 0 && size > 1 && cdist) {
1191*f17f7ee2SSatish Balay     PetscSF      sf;
1192*f17f7ee2SSatish Balay     MPI_Datatype dtype;
1193*f17f7ee2SSatish Balay 
1194*f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1195*f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_commit(&dtype));
1196*f17f7ee2SSatish Balay 
1197*f17f7ee2SSatish Balay     PetscCall(PetscSFCreate(comm,&sf));
1198*f17f7ee2SSatish Balay     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1199*f17f7ee2SSatish Balay     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1200*f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1201*f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1202*f17f7ee2SSatish Balay     PetscCall(PetscSFDestroy(&sf));
1203*f17f7ee2SSatish Balay     PetscCallMPI(MPI_Type_free(&dtype));
1204*f17f7ee2SSatish Balay   } else gcoords = (PetscReal*)coords;
1205*f17f7ee2SSatish Balay 
1206*f17f7ee2SSatish Balay   delete h2opus->ptcloud;
1207*f17f7ee2SSatish Balay   delete h2opus->kernel;
1208*f17f7ee2SSatish Balay   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1209*f17f7ee2SSatish Balay   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1210*f17f7ee2SSatish Balay   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1211*f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
1212*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1213*f17f7ee2SSatish Balay }
1214*f17f7ee2SSatish Balay 
1215*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1216*f17f7ee2SSatish Balay static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1217*f17f7ee2SSatish Balay {
1218*f17f7ee2SSatish Balay   PetscMPIInt    size;
1219*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1220*f17f7ee2SSatish Balay 
1221*f17f7ee2SSatish Balay   PetscFunctionBegin;
1222*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1223*f17f7ee2SSatish Balay   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1224*f17f7ee2SSatish Balay     if (size > 1) {
1225*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1226*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1227*f17f7ee2SSatish Balay       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1228*f17f7ee2SSatish Balay       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1229*f17f7ee2SSatish Balay #endif
1230*f17f7ee2SSatish Balay     } else {
1231*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1232*f17f7ee2SSatish Balay       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1233*f17f7ee2SSatish Balay       else *a->hmatrix = *a->hmatrix_gpu;
1234*f17f7ee2SSatish Balay     }
1235*f17f7ee2SSatish Balay     delete a->hmatrix_gpu;
1236*f17f7ee2SSatish Balay     delete a->dist_hmatrix_gpu;
1237*f17f7ee2SSatish Balay     a->hmatrix_gpu = NULL;
1238*f17f7ee2SSatish Balay     a->dist_hmatrix_gpu = NULL;
1239*f17f7ee2SSatish Balay     A->offloadmask = PETSC_OFFLOAD_CPU;
1240*f17f7ee2SSatish Balay   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1241*f17f7ee2SSatish Balay     if (size > 1) {
1242*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1243*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1244*f17f7ee2SSatish Balay       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1245*f17f7ee2SSatish Balay       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1246*f17f7ee2SSatish Balay #endif
1247*f17f7ee2SSatish Balay     } else {
1248*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1249*f17f7ee2SSatish Balay       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1250*f17f7ee2SSatish Balay       else *a->hmatrix_gpu = *a->hmatrix;
1251*f17f7ee2SSatish Balay     }
1252*f17f7ee2SSatish Balay     delete a->hmatrix;
1253*f17f7ee2SSatish Balay     delete a->dist_hmatrix;
1254*f17f7ee2SSatish Balay     a->hmatrix = NULL;
1255*f17f7ee2SSatish Balay     a->dist_hmatrix = NULL;
1256*f17f7ee2SSatish Balay     A->offloadmask = PETSC_OFFLOAD_GPU;
1257*f17f7ee2SSatish Balay   }
1258*f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1259*f17f7ee2SSatish Balay   if (!flg) {
1260*f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1261*f17f7ee2SSatish Balay   } else {
1262*f17f7ee2SSatish Balay     PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
1263*f17f7ee2SSatish Balay   }
1264*f17f7ee2SSatish Balay   A->boundtocpu = flg;
1265*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1266*f17f7ee2SSatish Balay }
1267*f17f7ee2SSatish Balay #endif
1268*f17f7ee2SSatish Balay 
1269*f17f7ee2SSatish Balay /*MC
1270*f17f7ee2SSatish Balay      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1271*f17f7ee2SSatish Balay 
1272*f17f7ee2SSatish Balay    Options Database Keys:
1273*f17f7ee2SSatish Balay .     -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()
1274*f17f7ee2SSatish Balay 
1275*f17f7ee2SSatish Balay    Notes:
1276*f17f7ee2SSatish Balay      H2Opus implements hierarchical matrices in the H^2 flavour.
1277*f17f7ee2SSatish Balay      It supports CPU or NVIDIA GPUs.
1278*f17f7ee2SSatish Balay      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1279*f17f7ee2SSatish Balay      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1280*f17f7ee2SSatish Balay      For details and additional references, see
1281*f17f7ee2SSatish Balay        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1282*f17f7ee2SSatish Balay      available at https://arxiv.org/abs/2109.05451.
1283*f17f7ee2SSatish Balay 
1284*f17f7ee2SSatish Balay    Level: beginner
1285*f17f7ee2SSatish Balay 
1286*f17f7ee2SSatish Balay .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1287*f17f7ee2SSatish Balay M*/
1288*f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1289*f17f7ee2SSatish Balay {
1290*f17f7ee2SSatish Balay   Mat_H2OPUS     *a;
1291*f17f7ee2SSatish Balay   PetscMPIInt    size;
1292*f17f7ee2SSatish Balay 
1293*f17f7ee2SSatish Balay   PetscFunctionBegin;
1294*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1295*f17f7ee2SSatish Balay   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1296*f17f7ee2SSatish Balay #endif
1297*f17f7ee2SSatish Balay   PetscCall(PetscNewLog(A,&a));
1298*f17f7ee2SSatish Balay   A->data = (void*)a;
1299*f17f7ee2SSatish Balay 
1300*f17f7ee2SSatish Balay   a->eta              = 0.9;
1301*f17f7ee2SSatish Balay   a->leafsize         = 32;
1302*f17f7ee2SSatish Balay   a->basisord         = 4;
1303*f17f7ee2SSatish Balay   a->max_rank         = 64;
1304*f17f7ee2SSatish Balay   a->bs               = 32;
1305*f17f7ee2SSatish Balay   a->rtol             = 1.e-4;
1306*f17f7ee2SSatish Balay   a->s                = 1.0;
1307*f17f7ee2SSatish Balay   a->norm_max_samples = 10;
1308*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1309*f17f7ee2SSatish Balay   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1310*f17f7ee2SSatish Balay #else
1311*f17f7ee2SSatish Balay   h2opusCreateHandle(&a->handle);
1312*f17f7ee2SSatish Balay #endif
1313*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1314*f17f7ee2SSatish Balay   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
1315*f17f7ee2SSatish Balay   PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));
1316*f17f7ee2SSatish Balay 
1317*f17f7ee2SSatish Balay   A->ops->destroy          = MatDestroy_H2OPUS;
1318*f17f7ee2SSatish Balay   A->ops->view             = MatView_H2OPUS;
1319*f17f7ee2SSatish Balay   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1320*f17f7ee2SSatish Balay   A->ops->mult             = MatMult_H2OPUS;
1321*f17f7ee2SSatish Balay   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1322*f17f7ee2SSatish Balay   A->ops->multadd          = MatMultAdd_H2OPUS;
1323*f17f7ee2SSatish Balay   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1324*f17f7ee2SSatish Balay   A->ops->scale            = MatScale_H2OPUS;
1325*f17f7ee2SSatish Balay   A->ops->duplicate        = MatDuplicate_H2OPUS;
1326*f17f7ee2SSatish Balay   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1327*f17f7ee2SSatish Balay   A->ops->norm             = MatNorm_H2OPUS;
1328*f17f7ee2SSatish Balay   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1329*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1330*f17f7ee2SSatish Balay   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1331*f17f7ee2SSatish Balay #endif
1332*f17f7ee2SSatish Balay 
1333*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
1334*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
1335*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
1336*f17f7ee2SSatish Balay   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
1337*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1338*f17f7ee2SSatish Balay   PetscCall(PetscFree(A->defaultvectype));
1339*f17f7ee2SSatish Balay   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1340*f17f7ee2SSatish Balay #endif
1341*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1342*f17f7ee2SSatish Balay }
1343*f17f7ee2SSatish Balay 
1344*f17f7ee2SSatish Balay /*@C
1345*f17f7ee2SSatish Balay      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1346*f17f7ee2SSatish Balay 
1347*f17f7ee2SSatish Balay    Input Parameter:
1348*f17f7ee2SSatish Balay .     A - the matrix
1349*f17f7ee2SSatish Balay 
1350*f17f7ee2SSatish Balay    Level: intermediate
1351*f17f7ee2SSatish Balay 
1352*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1353*f17f7ee2SSatish Balay @*/
1354*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1355*f17f7ee2SSatish Balay {
1356*f17f7ee2SSatish Balay   PetscBool      ish2opus;
1357*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1358*f17f7ee2SSatish Balay   PetscMPIInt    size;
1359*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1360*f17f7ee2SSatish Balay 
1361*f17f7ee2SSatish Balay   PetscFunctionBegin;
1362*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1363*f17f7ee2SSatish Balay   PetscValidType(A,1);
1364*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1365*f17f7ee2SSatish Balay   if (!ish2opus) PetscFunctionReturn(0);
1366*f17f7ee2SSatish Balay   if (a->orthogonal) PetscFunctionReturn(0);
1367*f17f7ee2SSatish Balay   HLibProfile::clear();
1368*f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
1369*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1370*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1371*f17f7ee2SSatish Balay #endif
1372*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1373*f17f7ee2SSatish Balay   if (size > 1) {
1374*f17f7ee2SSatish Balay     if (boundtocpu) {
1375*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1376*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1377*f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix, a->handle);
1378*f17f7ee2SSatish Balay #endif
1379*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1380*f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1381*f17f7ee2SSatish Balay     } else {
1382*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1383*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1384*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1385*f17f7ee2SSatish Balay       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1386*f17f7ee2SSatish Balay #endif
1387*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1388*f17f7ee2SSatish Balay #endif
1389*f17f7ee2SSatish Balay     }
1390*f17f7ee2SSatish Balay   } else {
1391*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1392*f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1393*f17f7ee2SSatish Balay #else
1394*f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1395*f17f7ee2SSatish Balay #endif
1396*f17f7ee2SSatish Balay     if (boundtocpu) {
1397*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1398*f17f7ee2SSatish Balay       horthog(*a->hmatrix, handle);
1399*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1400*f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1401*f17f7ee2SSatish Balay     } else {
1402*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1403*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1404*f17f7ee2SSatish Balay       horthog(*a->hmatrix_gpu, handle);
1405*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1406*f17f7ee2SSatish Balay #endif
1407*f17f7ee2SSatish Balay     }
1408*f17f7ee2SSatish Balay   }
1409*f17f7ee2SSatish Balay   a->orthogonal = PETSC_TRUE;
1410*f17f7ee2SSatish Balay   { /* log flops */
1411*f17f7ee2SSatish Balay     double gops,time,perf,dev;
1412*f17f7ee2SSatish Balay     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1413*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1414*f17f7ee2SSatish Balay     if (boundtocpu) {
1415*f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
1416*f17f7ee2SSatish Balay     } else {
1417*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
1418*f17f7ee2SSatish Balay     }
1419*f17f7ee2SSatish Balay #else
1420*f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
1421*f17f7ee2SSatish Balay #endif
1422*f17f7ee2SSatish Balay   }
1423*f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
1424*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1425*f17f7ee2SSatish Balay }
1426*f17f7ee2SSatish Balay 
1427*f17f7ee2SSatish Balay /*@C
1428*f17f7ee2SSatish Balay      MatH2OpusCompress - Compress a hierarchical matrix.
1429*f17f7ee2SSatish Balay 
1430*f17f7ee2SSatish Balay    Input Parameters:
1431*f17f7ee2SSatish Balay +     A - the matrix
1432*f17f7ee2SSatish Balay -     tol - the absolute truncation threshold
1433*f17f7ee2SSatish Balay 
1434*f17f7ee2SSatish Balay    Level: intermediate
1435*f17f7ee2SSatish Balay 
1436*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1437*f17f7ee2SSatish Balay @*/
1438*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1439*f17f7ee2SSatish Balay {
1440*f17f7ee2SSatish Balay   PetscBool      ish2opus;
1441*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1442*f17f7ee2SSatish Balay   PetscMPIInt    size;
1443*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1444*f17f7ee2SSatish Balay 
1445*f17f7ee2SSatish Balay   PetscFunctionBegin;
1446*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1447*f17f7ee2SSatish Balay   PetscValidType(A,1);
1448*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A,tol,2);
1449*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1450*f17f7ee2SSatish Balay   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
1451*f17f7ee2SSatish Balay   PetscCall(MatH2OpusOrthogonalize(A));
1452*f17f7ee2SSatish Balay   HLibProfile::clear();
1453*f17f7ee2SSatish Balay   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
1454*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1455*f17f7ee2SSatish Balay   boundtocpu = A->boundtocpu;
1456*f17f7ee2SSatish Balay #endif
1457*f17f7ee2SSatish Balay   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1458*f17f7ee2SSatish Balay   if (size > 1) {
1459*f17f7ee2SSatish Balay     if (boundtocpu) {
1460*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1461*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1462*f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1463*f17f7ee2SSatish Balay #endif
1464*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1465*f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1466*f17f7ee2SSatish Balay     } else {
1467*f17f7ee2SSatish Balay       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1468*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1469*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1470*f17f7ee2SSatish Balay       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1471*f17f7ee2SSatish Balay #endif
1472*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1473*f17f7ee2SSatish Balay #endif
1474*f17f7ee2SSatish Balay     }
1475*f17f7ee2SSatish Balay   } else {
1476*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1477*f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle->handle;
1478*f17f7ee2SSatish Balay #else
1479*f17f7ee2SSatish Balay     h2opusHandle_t handle = a->handle;
1480*f17f7ee2SSatish Balay #endif
1481*f17f7ee2SSatish Balay     if (boundtocpu) {
1482*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1483*f17f7ee2SSatish Balay       hcompress(*a->hmatrix, tol, handle);
1484*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1485*f17f7ee2SSatish Balay       A->offloadmask = PETSC_OFFLOAD_CPU;
1486*f17f7ee2SSatish Balay     } else {
1487*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1488*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeBegin());
1489*f17f7ee2SSatish Balay       hcompress(*a->hmatrix_gpu, tol, handle);
1490*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuTimeEnd());
1491*f17f7ee2SSatish Balay #endif
1492*f17f7ee2SSatish Balay     }
1493*f17f7ee2SSatish Balay   }
1494*f17f7ee2SSatish Balay   { /* log flops */
1495*f17f7ee2SSatish Balay     double gops,time,perf,dev;
1496*f17f7ee2SSatish Balay     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1497*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1498*f17f7ee2SSatish Balay     if (boundtocpu) {
1499*f17f7ee2SSatish Balay       PetscCall(PetscLogFlops(1e9*gops));
1500*f17f7ee2SSatish Balay     } else {
1501*f17f7ee2SSatish Balay       PetscCall(PetscLogGpuFlops(1e9*gops));
1502*f17f7ee2SSatish Balay     }
1503*f17f7ee2SSatish Balay #else
1504*f17f7ee2SSatish Balay     PetscCall(PetscLogFlops(1e9*gops));
1505*f17f7ee2SSatish Balay #endif
1506*f17f7ee2SSatish Balay   }
1507*f17f7ee2SSatish Balay   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
1508*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1509*f17f7ee2SSatish Balay }
1510*f17f7ee2SSatish Balay 
1511*f17f7ee2SSatish Balay /*@C
1512*f17f7ee2SSatish Balay      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1513*f17f7ee2SSatish Balay 
1514*f17f7ee2SSatish Balay    Input Parameters:
1515*f17f7ee2SSatish Balay +     A - the hierarchical matrix
1516*f17f7ee2SSatish Balay .     B - the matrix to be sampled
1517*f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1518*f17f7ee2SSatish Balay -     tol - relative tolerance for construction
1519*f17f7ee2SSatish Balay 
1520*f17f7ee2SSatish Balay    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1521*f17f7ee2SSatish Balay 
1522*f17f7ee2SSatish Balay    Level: intermediate
1523*f17f7ee2SSatish Balay 
1524*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1525*f17f7ee2SSatish Balay @*/
1526*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1527*f17f7ee2SSatish Balay {
1528*f17f7ee2SSatish Balay   PetscBool      ish2opus;
1529*f17f7ee2SSatish Balay 
1530*f17f7ee2SSatish Balay   PetscFunctionBegin;
1531*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1532*f17f7ee2SSatish Balay   PetscValidType(A,1);
1533*f17f7ee2SSatish Balay   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1534*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(A,bs,3);
1535*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(A,tol,3);
1536*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1537*f17f7ee2SSatish Balay   if (ish2opus) {
1538*f17f7ee2SSatish Balay     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1539*f17f7ee2SSatish Balay 
1540*f17f7ee2SSatish Balay     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1541*f17f7ee2SSatish Balay     a->sampler->SetSamplingMat(B);
1542*f17f7ee2SSatish Balay     if (bs > 0) a->bs = bs;
1543*f17f7ee2SSatish Balay     if (tol > 0.) a->rtol = tol;
1544*f17f7ee2SSatish Balay     delete a->kernel;
1545*f17f7ee2SSatish Balay   }
1546*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1547*f17f7ee2SSatish Balay }
1548*f17f7ee2SSatish Balay 
1549*f17f7ee2SSatish Balay /*@C
1550*f17f7ee2SSatish Balay      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1551*f17f7ee2SSatish Balay 
1552*f17f7ee2SSatish Balay    Input Parameters:
1553*f17f7ee2SSatish Balay +     comm - MPI communicator
1554*f17f7ee2SSatish Balay .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1555*f17f7ee2SSatish Balay .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1556*f17f7ee2SSatish Balay .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1557*f17f7ee2SSatish Balay .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1558*f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1559*f17f7ee2SSatish Balay .     coords - coordinates of the points
1560*f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
1561*f17f7ee2SSatish Balay .     kernel - computational kernel (or NULL)
1562*f17f7ee2SSatish Balay .     kernelctx - kernel context
1563*f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1564*f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1565*f17f7ee2SSatish Balay -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1566*f17f7ee2SSatish Balay 
1567*f17f7ee2SSatish Balay    Output Parameter:
1568*f17f7ee2SSatish Balay .     nA - matrix
1569*f17f7ee2SSatish Balay 
1570*f17f7ee2SSatish Balay    Options Database Keys:
1571*f17f7ee2SSatish Balay +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1572*f17f7ee2SSatish Balay .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1573*f17f7ee2SSatish Balay .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1574*f17f7ee2SSatish Balay -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1575*f17f7ee2SSatish Balay 
1576*f17f7ee2SSatish Balay    Level: intermediate
1577*f17f7ee2SSatish Balay 
1578*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1579*f17f7ee2SSatish Balay @*/
1580*f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1581*f17f7ee2SSatish Balay {
1582*f17f7ee2SSatish Balay   Mat            A;
1583*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus;
1584*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1585*f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_FALSE;
1586*f17f7ee2SSatish Balay #else
1587*f17f7ee2SSatish Balay   PetscBool      iscpu = PETSC_TRUE;
1588*f17f7ee2SSatish Balay #endif
1589*f17f7ee2SSatish Balay 
1590*f17f7ee2SSatish Balay   PetscFunctionBegin;
1591*f17f7ee2SSatish Balay   PetscCheckFalse(m != n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1592*f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1593*f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,m,n,M,N));
1594*f17f7ee2SSatish Balay   PetscCheckFalse(M != N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1595*f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1596*f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,iscpu));
1597*f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1598*f17f7ee2SSatish Balay 
1599*f17f7ee2SSatish Balay   h2opus = (Mat_H2OPUS*)A->data;
1600*f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1601*f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1602*f17f7ee2SSatish Balay   if (basisord > 0) h2opus->basisord = basisord;
1603*f17f7ee2SSatish Balay 
1604*f17f7ee2SSatish Balay   *nA = A;
1605*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1606*f17f7ee2SSatish Balay }
1607*f17f7ee2SSatish Balay 
1608*f17f7ee2SSatish Balay /*@C
1609*f17f7ee2SSatish Balay      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1610*f17f7ee2SSatish Balay 
1611*f17f7ee2SSatish Balay    Input Parameters:
1612*f17f7ee2SSatish Balay +     B - the matrix to be sampled
1613*f17f7ee2SSatish Balay .     spacedim - dimension of the space coordinates
1614*f17f7ee2SSatish Balay .     coords - coordinates of the points
1615*f17f7ee2SSatish Balay .     cdist - whether or not coordinates are distributed
1616*f17f7ee2SSatish Balay .     eta - admissibility condition tolerance
1617*f17f7ee2SSatish Balay .     leafsize - leaf size in cluster tree
1618*f17f7ee2SSatish Balay .     maxrank - maximum rank allowed
1619*f17f7ee2SSatish Balay .     bs - maximum number of samples to be taken concurrently
1620*f17f7ee2SSatish Balay -     rtol - relative tolerance for construction
1621*f17f7ee2SSatish Balay 
1622*f17f7ee2SSatish Balay    Output Parameter:
1623*f17f7ee2SSatish Balay .     nA - matrix
1624*f17f7ee2SSatish Balay 
1625*f17f7ee2SSatish Balay    Options Database Keys:
1626*f17f7ee2SSatish Balay +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1627*f17f7ee2SSatish Balay .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1628*f17f7ee2SSatish Balay .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1629*f17f7ee2SSatish Balay .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1630*f17f7ee2SSatish Balay .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1631*f17f7ee2SSatish Balay .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1632*f17f7ee2SSatish Balay .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1633*f17f7ee2SSatish Balay -     -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms
1634*f17f7ee2SSatish Balay 
1635*f17f7ee2SSatish Balay    Notes: not available in parallel
1636*f17f7ee2SSatish Balay 
1637*f17f7ee2SSatish Balay    Level: intermediate
1638*f17f7ee2SSatish Balay 
1639*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1640*f17f7ee2SSatish Balay @*/
1641*f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1642*f17f7ee2SSatish Balay {
1643*f17f7ee2SSatish Balay   Mat            A;
1644*f17f7ee2SSatish Balay   Mat_H2OPUS     *h2opus;
1645*f17f7ee2SSatish Balay   MPI_Comm       comm;
1646*f17f7ee2SSatish Balay   PetscBool      boundtocpu = PETSC_TRUE;
1647*f17f7ee2SSatish Balay 
1648*f17f7ee2SSatish Balay   PetscFunctionBegin;
1649*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1650*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,spacedim,2);
1651*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(B,cdist,4);
1652*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B,eta,5);
1653*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,leafsize,6);
1654*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,maxrank,7);
1655*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveInt(B,bs,8);
1656*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveReal(B,rtol,9);
1657*f17f7ee2SSatish Balay   PetscValidPointer(nA,10);
1658*f17f7ee2SSatish Balay   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1659*f17f7ee2SSatish Balay   PetscCheckFalse(B->rmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1660*f17f7ee2SSatish Balay   PetscCheckFalse(B->rmap->N != B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1661*f17f7ee2SSatish Balay   PetscCall(MatCreate(comm,&A));
1662*f17f7ee2SSatish Balay   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1663*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1664*f17f7ee2SSatish Balay   {
1665*f17f7ee2SSatish Balay     PetscBool iscuda;
1666*f17f7ee2SSatish Balay     VecType   vtype;
1667*f17f7ee2SSatish Balay 
1668*f17f7ee2SSatish Balay     PetscCall(MatGetVecType(B,&vtype));
1669*f17f7ee2SSatish Balay     PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
1670*f17f7ee2SSatish Balay     if (!iscuda) {
1671*f17f7ee2SSatish Balay       PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
1672*f17f7ee2SSatish Balay       if (!iscuda) {
1673*f17f7ee2SSatish Balay         PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
1674*f17f7ee2SSatish Balay       }
1675*f17f7ee2SSatish Balay     }
1676*f17f7ee2SSatish Balay     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1677*f17f7ee2SSatish Balay   }
1678*f17f7ee2SSatish Balay #endif
1679*f17f7ee2SSatish Balay   PetscCall(MatSetType(A,MATH2OPUS));
1680*f17f7ee2SSatish Balay   PetscCall(MatBindToCPU(A,boundtocpu));
1681*f17f7ee2SSatish Balay   if (spacedim) {
1682*f17f7ee2SSatish Balay     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
1683*f17f7ee2SSatish Balay   }
1684*f17f7ee2SSatish Balay   PetscCall(MatPropagateSymmetryOptions(B,A));
1685*f17f7ee2SSatish Balay   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1686*f17f7ee2SSatish Balay 
1687*f17f7ee2SSatish Balay   h2opus = (Mat_H2OPUS*)A->data;
1688*f17f7ee2SSatish Balay   h2opus->sampler = new PetscMatrixSampler(B);
1689*f17f7ee2SSatish Balay   if (eta > 0.) h2opus->eta = eta;
1690*f17f7ee2SSatish Balay   if (leafsize > 0) h2opus->leafsize = leafsize;
1691*f17f7ee2SSatish Balay   if (maxrank > 0) h2opus->max_rank = maxrank;
1692*f17f7ee2SSatish Balay   if (bs > 0) h2opus->bs = bs;
1693*f17f7ee2SSatish Balay   if (rtol > 0.) h2opus->rtol = rtol;
1694*f17f7ee2SSatish Balay   *nA = A;
1695*f17f7ee2SSatish Balay   A->preallocated = PETSC_TRUE;
1696*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1697*f17f7ee2SSatish Balay }
1698*f17f7ee2SSatish Balay 
1699*f17f7ee2SSatish Balay /*@C
1700*f17f7ee2SSatish Balay      MatH2OpusGetIndexMap - Access reordering index set.
1701*f17f7ee2SSatish Balay 
1702*f17f7ee2SSatish Balay    Input Parameters:
1703*f17f7ee2SSatish Balay .     A - the matrix
1704*f17f7ee2SSatish Balay 
1705*f17f7ee2SSatish Balay    Output Parameter:
1706*f17f7ee2SSatish Balay .     indexmap - the index set for the reordering
1707*f17f7ee2SSatish Balay 
1708*f17f7ee2SSatish Balay    Level: intermediate
1709*f17f7ee2SSatish Balay 
1710*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1711*f17f7ee2SSatish Balay @*/
1712*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1713*f17f7ee2SSatish Balay {
1714*f17f7ee2SSatish Balay   PetscBool      ish2opus;
1715*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1716*f17f7ee2SSatish Balay 
1717*f17f7ee2SSatish Balay   PetscFunctionBegin;
1718*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1719*f17f7ee2SSatish Balay   PetscValidType(A,1);
1720*f17f7ee2SSatish Balay   PetscValidPointer(indexmap,2);
1721*f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1722*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1723*f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1724*f17f7ee2SSatish Balay   *indexmap = a->h2opus_indexmap;
1725*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1726*f17f7ee2SSatish Balay }
1727*f17f7ee2SSatish Balay 
1728*f17f7ee2SSatish Balay /*@C
1729*f17f7ee2SSatish Balay      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1730*f17f7ee2SSatish Balay 
1731*f17f7ee2SSatish Balay    Input Parameters:
1732*f17f7ee2SSatish Balay +     A - the matrix
1733*f17f7ee2SSatish Balay .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1734*f17f7ee2SSatish Balay -     in - the vector to be mapped
1735*f17f7ee2SSatish Balay 
1736*f17f7ee2SSatish Balay    Output Parameter:
1737*f17f7ee2SSatish Balay .     out - the newly created mapped vector
1738*f17f7ee2SSatish Balay 
1739*f17f7ee2SSatish Balay    Level: intermediate
1740*f17f7ee2SSatish Balay 
1741*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1742*f17f7ee2SSatish Balay @*/
1743*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1744*f17f7ee2SSatish Balay {
1745*f17f7ee2SSatish Balay   PetscBool      ish2opus;
1746*f17f7ee2SSatish Balay   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1747*f17f7ee2SSatish Balay   PetscScalar    *xin,*xout;
1748*f17f7ee2SSatish Balay   PetscBool      nm;
1749*f17f7ee2SSatish Balay 
1750*f17f7ee2SSatish Balay   PetscFunctionBegin;
1751*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1752*f17f7ee2SSatish Balay   PetscValidType(A,1);
1753*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1754*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1755*f17f7ee2SSatish Balay   PetscValidPointer(out,4);
1756*f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1757*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1758*f17f7ee2SSatish Balay   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1759*f17f7ee2SSatish Balay   nm   = a->nativemult;
1760*f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1761*f17f7ee2SSatish Balay   PetscCall(MatCreateVecs(A,out,NULL));
1762*f17f7ee2SSatish Balay   PetscCall(MatH2OpusSetNativeMult(A,nm));
1763*f17f7ee2SSatish Balay   if (!a->sf) { /* same ordering */
1764*f17f7ee2SSatish Balay     PetscCall(VecCopy(in,*out));
1765*f17f7ee2SSatish Balay     PetscFunctionReturn(0);
1766*f17f7ee2SSatish Balay   }
1767*f17f7ee2SSatish Balay   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1768*f17f7ee2SSatish Balay   PetscCall(VecGetArrayWrite(*out,&xout));
1769*f17f7ee2SSatish Balay   if (nativetopetsc) {
1770*f17f7ee2SSatish Balay     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1771*f17f7ee2SSatish Balay     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1772*f17f7ee2SSatish Balay   } else {
1773*f17f7ee2SSatish Balay     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1774*f17f7ee2SSatish Balay     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1775*f17f7ee2SSatish Balay   }
1776*f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1777*f17f7ee2SSatish Balay   PetscCall(VecRestoreArrayWrite(*out,&xout));
1778*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1779*f17f7ee2SSatish Balay }
1780*f17f7ee2SSatish Balay 
1781*f17f7ee2SSatish Balay /*@C
1782*f17f7ee2SSatish Balay      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1783*f17f7ee2SSatish Balay 
1784*f17f7ee2SSatish Balay    Input Parameters:
1785*f17f7ee2SSatish Balay +     A - the hierarchical matrix
1786*f17f7ee2SSatish Balay .     s - the scaling factor
1787*f17f7ee2SSatish Balay .     U - the dense low-rank update matrix
1788*f17f7ee2SSatish Balay -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1789*f17f7ee2SSatish Balay 
1790*f17f7ee2SSatish Balay    Notes: The U and V matrices must be in dense format
1791*f17f7ee2SSatish Balay 
1792*f17f7ee2SSatish Balay    Level: intermediate
1793*f17f7ee2SSatish Balay 
1794*f17f7ee2SSatish Balay .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE
1795*f17f7ee2SSatish Balay @*/
1796*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1797*f17f7ee2SSatish Balay {
1798*f17f7ee2SSatish Balay   PetscBool      flg;
1799*f17f7ee2SSatish Balay 
1800*f17f7ee2SSatish Balay   PetscFunctionBegin;
1801*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1802*f17f7ee2SSatish Balay   PetscValidType(A,1);
1803*f17f7ee2SSatish Balay   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1804*f17f7ee2SSatish Balay   PetscValidHeaderSpecific(U,MAT_CLASSID,2);
1805*f17f7ee2SSatish Balay   PetscCheckSameComm(A,1,U,2);
1806*f17f7ee2SSatish Balay   if (V) {
1807*f17f7ee2SSatish Balay     PetscValidHeaderSpecific(V,MAT_CLASSID,3);
1808*f17f7ee2SSatish Balay     PetscCheckSameComm(A,1,V,3);
1809*f17f7ee2SSatish Balay   }
1810*f17f7ee2SSatish Balay   PetscValidLogicalCollectiveScalar(A,s,4);
1811*f17f7ee2SSatish Balay 
1812*f17f7ee2SSatish Balay   if (!V) V = U;
1813*f17f7ee2SSatish Balay   PetscCheckFalse(U->cmap->N != V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
1814*f17f7ee2SSatish Balay   if (!U->cmap->N) PetscFunctionReturn(0);
1815*f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
1816*f17f7ee2SSatish Balay   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
1817*f17f7ee2SSatish Balay   PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
1818*f17f7ee2SSatish Balay   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
1819*f17f7ee2SSatish Balay   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
1820*f17f7ee2SSatish Balay   if (flg) {
1821*f17f7ee2SSatish Balay     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1822*f17f7ee2SSatish Balay     const PetscScalar *u,*v,*uu,*vv;
1823*f17f7ee2SSatish Balay     PetscInt          ldu,ldv;
1824*f17f7ee2SSatish Balay     PetscMPIInt       size;
1825*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI)
1826*f17f7ee2SSatish Balay     h2opusHandle_t    handle = a->handle->handle;
1827*f17f7ee2SSatish Balay #else
1828*f17f7ee2SSatish Balay     h2opusHandle_t    handle = a->handle;
1829*f17f7ee2SSatish Balay #endif
1830*f17f7ee2SSatish Balay     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1831*f17f7ee2SSatish Balay     PetscSF           usf,vsf;
1832*f17f7ee2SSatish Balay 
1833*f17f7ee2SSatish Balay     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1834*f17f7ee2SSatish Balay     PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
1835*f17f7ee2SSatish Balay     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
1836*f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
1837*f17f7ee2SSatish Balay     PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
1838*f17f7ee2SSatish Balay     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
1839*f17f7ee2SSatish Balay     PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
1840*f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(U,&ldu));
1841*f17f7ee2SSatish Balay     PetscCall(MatDenseGetLDA(V,&ldv));
1842*f17f7ee2SSatish Balay     PetscCall(MatBoundToCPU(A,&flg));
1843*f17f7ee2SSatish Balay     if (usesf) {
1844*f17f7ee2SSatish Balay       PetscInt n;
1845*f17f7ee2SSatish Balay 
1846*f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
1847*f17f7ee2SSatish Balay       PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
1848*f17f7ee2SSatish Balay       PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
1849*f17f7ee2SSatish Balay       PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
1850*f17f7ee2SSatish Balay       ldu = n;
1851*f17f7ee2SSatish Balay       ldv = n;
1852*f17f7ee2SSatish Balay     }
1853*f17f7ee2SSatish Balay     if (flg) {
1854*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1855*f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(U,&u));
1856*f17f7ee2SSatish Balay       PetscCall(MatDenseGetArrayRead(V,&v));
1857*f17f7ee2SSatish Balay       if (usesf) {
1858*f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy);
1859*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1860*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1861*f17f7ee2SSatish Balay         if (U != V) {
1862*f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx);
1863*f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1864*f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1865*f17f7ee2SSatish Balay         } else uu = vv;
1866*f17f7ee2SSatish Balay       } else { uu = u; vv = v; }
1867*f17f7ee2SSatish Balay       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1868*f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(U,&u));
1869*f17f7ee2SSatish Balay       PetscCall(MatDenseRestoreArrayRead(V,&v));
1870*f17f7ee2SSatish Balay     } else {
1871*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1872*f17f7ee2SSatish Balay       PetscBool flgU, flgV;
1873*f17f7ee2SSatish Balay 
1874*f17f7ee2SSatish Balay       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1875*f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
1876*f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
1877*f17f7ee2SSatish Balay       PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
1878*f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
1879*f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(U,&u));
1880*f17f7ee2SSatish Balay       PetscCall(MatDenseCUDAGetArrayRead(V,&v));
1881*f17f7ee2SSatish Balay       if (usesf) {
1882*f17f7ee2SSatish Balay         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1883*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1884*f17f7ee2SSatish Balay         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1885*f17f7ee2SSatish Balay         if (U != V) {
1886*f17f7ee2SSatish Balay           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1887*f17f7ee2SSatish Balay           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1888*f17f7ee2SSatish Balay           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1889*f17f7ee2SSatish Balay         } else uu = vv;
1890*f17f7ee2SSatish Balay       } else { uu = u; vv = v; }
1891*f17f7ee2SSatish Balay #else
1892*f17f7ee2SSatish Balay       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1893*f17f7ee2SSatish Balay #endif
1894*f17f7ee2SSatish Balay       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1895*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU)
1896*f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
1897*f17f7ee2SSatish Balay       PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
1898*f17f7ee2SSatish Balay       if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
1899*f17f7ee2SSatish Balay       if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
1900*f17f7ee2SSatish Balay #endif
1901*f17f7ee2SSatish Balay     }
1902*f17f7ee2SSatish Balay     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
1903*f17f7ee2SSatish Balay     a->orthogonal = PETSC_FALSE;
1904*f17f7ee2SSatish Balay   }
1905*f17f7ee2SSatish Balay   PetscFunctionReturn(0);
1906*f17f7ee2SSatish Balay }
1907*f17f7ee2SSatish Balay #endif
1908