1*f17f7ee2SSatish Balay #include <h2opusconf.h> 2*f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3*f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4*f17f7ee2SSatish Balay #include <h2opus.h> 5*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 6*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h> 7*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h> 8*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h> 9*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h> 10*f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h> 11*f17f7ee2SSatish Balay #endif 12*f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h> 13*f17f7ee2SSatish Balay #include <petsc/private/matimpl.h> 14*f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h> 15*f17f7ee2SSatish Balay #include <petsc/private/deviceimpl.h> 16*f17f7ee2SSatish Balay #include <petscsf.h> 17*f17f7ee2SSatish Balay 18*f17f7ee2SSatish Balay /* math2opusutils */ 19*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*); 20*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*); 21*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec,Vec); 22*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt); 23*f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*); 24*f17f7ee2SSatish Balay 25*f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26*f17f7ee2SSatish Balay 27*f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */ 28*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29*f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU 30*f17f7ee2SSatish Balay #endif 31*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 32*f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B))) 33*f17f7ee2SSatish Balay #else 34*f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) 0 35*f17f7ee2SSatish Balay #endif 36*f17f7ee2SSatish Balay 37*f17f7ee2SSatish Balay // TODO H2OPUS: 38*f17f7ee2SSatish Balay // DistributedHMatrix 39*f17f7ee2SSatish Balay // unsymmetric ? 40*f17f7ee2SSatish Balay // transpose for distributed_hgemv? 41*f17f7ee2SSatish Balay // clearData() 42*f17f7ee2SSatish Balay // Unify interface for sequential and parallel? 43*f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44*f17f7ee2SSatish Balay // 45*f17f7ee2SSatish Balay template <class T> class PetscPointCloud : public H2OpusDataSet<T> 46*f17f7ee2SSatish Balay { 47*f17f7ee2SSatish Balay private: 48*f17f7ee2SSatish Balay int dimension; 49*f17f7ee2SSatish Balay size_t num_points; 50*f17f7ee2SSatish Balay std::vector<T> pts; 51*f17f7ee2SSatish Balay 52*f17f7ee2SSatish Balay public: 53*f17f7ee2SSatish Balay PetscPointCloud(int dim, size_t num_pts, const T coords[]) 54*f17f7ee2SSatish Balay { 55*f17f7ee2SSatish Balay dim = dim > 0 ? dim : 1; 56*f17f7ee2SSatish Balay this->dimension = dim; 57*f17f7ee2SSatish Balay this->num_points = num_pts; 58*f17f7ee2SSatish Balay 59*f17f7ee2SSatish Balay pts.resize(num_pts*dim); 60*f17f7ee2SSatish Balay if (coords) { 61*f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) 62*f17f7ee2SSatish Balay for (int i = 0; i < dim; i++) 63*f17f7ee2SSatish Balay pts[n*dim + i] = coords[n*dim + i]; 64*f17f7ee2SSatish Balay } else { 65*f17f7ee2SSatish Balay PetscReal h = 1./(num_points - 1); 66*f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) 67*f17f7ee2SSatish Balay for (int i = 0; i < dim; i++) 68*f17f7ee2SSatish Balay pts[n*dim + i] = i*h; 69*f17f7ee2SSatish Balay } 70*f17f7ee2SSatish Balay } 71*f17f7ee2SSatish Balay 72*f17f7ee2SSatish Balay PetscPointCloud(const PetscPointCloud<T>& other) 73*f17f7ee2SSatish Balay { 74*f17f7ee2SSatish Balay size_t N = other.dimension * other.num_points; 75*f17f7ee2SSatish Balay this->dimension = other.dimension; 76*f17f7ee2SSatish Balay this->num_points = other.num_points; 77*f17f7ee2SSatish Balay this->pts.resize(N); 78*f17f7ee2SSatish Balay for (size_t i = 0; i < N; i++) 79*f17f7ee2SSatish Balay this->pts[i] = other.pts[i]; 80*f17f7ee2SSatish Balay } 81*f17f7ee2SSatish Balay 82*f17f7ee2SSatish Balay int getDimension() const 83*f17f7ee2SSatish Balay { 84*f17f7ee2SSatish Balay return dimension; 85*f17f7ee2SSatish Balay } 86*f17f7ee2SSatish Balay 87*f17f7ee2SSatish Balay size_t getDataSetSize() const 88*f17f7ee2SSatish Balay { 89*f17f7ee2SSatish Balay return num_points; 90*f17f7ee2SSatish Balay } 91*f17f7ee2SSatish Balay 92*f17f7ee2SSatish Balay T getDataPoint(size_t idx, int dim) const 93*f17f7ee2SSatish Balay { 94*f17f7ee2SSatish Balay assert(dim < dimension && idx < num_points); 95*f17f7ee2SSatish Balay return pts[idx*dimension + dim]; 96*f17f7ee2SSatish Balay } 97*f17f7ee2SSatish Balay 98*f17f7ee2SSatish Balay void Print(std::ostream& out = std::cout) 99*f17f7ee2SSatish Balay { 100*f17f7ee2SSatish Balay out << "Dimension: " << dimension << std::endl; 101*f17f7ee2SSatish Balay out << "NumPoints: " << num_points << std::endl; 102*f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) { 103*f17f7ee2SSatish Balay for (int d = 0; d < dimension; d++) 104*f17f7ee2SSatish Balay out << pts[n*dimension + d] << " "; 105*f17f7ee2SSatish Balay out << std::endl; 106*f17f7ee2SSatish Balay } 107*f17f7ee2SSatish Balay } 108*f17f7ee2SSatish Balay }; 109*f17f7ee2SSatish Balay 110*f17f7ee2SSatish Balay template<class T> class PetscFunctionGenerator 111*f17f7ee2SSatish Balay { 112*f17f7ee2SSatish Balay private: 113*f17f7ee2SSatish Balay MatH2OpusKernel k; 114*f17f7ee2SSatish Balay int dim; 115*f17f7ee2SSatish Balay void *ctx; 116*f17f7ee2SSatish Balay 117*f17f7ee2SSatish Balay public: 118*f17f7ee2SSatish Balay PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; } 119*f17f7ee2SSatish Balay PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; } 120*f17f7ee2SSatish Balay T operator()(PetscReal *pt1, PetscReal *pt2) 121*f17f7ee2SSatish Balay { 122*f17f7ee2SSatish Balay return (T)((*this->k)(this->dim,pt1,pt2,this->ctx)); 123*f17f7ee2SSatish Balay } 124*f17f7ee2SSatish Balay }; 125*f17f7ee2SSatish Balay 126*f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp> 127*f17f7ee2SSatish Balay 128*f17f7ee2SSatish Balay /* just to not clutter the code */ 129*f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU) 130*f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU; 131*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 132*f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU; 133*f17f7ee2SSatish Balay #endif 134*f17f7ee2SSatish Balay #endif 135*f17f7ee2SSatish Balay 136*f17f7ee2SSatish Balay typedef struct { 137*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 138*f17f7ee2SSatish Balay distributedH2OpusHandle_t handle; 139*f17f7ee2SSatish Balay #else 140*f17f7ee2SSatish Balay h2opusHandle_t handle; 141*f17f7ee2SSatish Balay #endif 142*f17f7ee2SSatish Balay /* Sequential and parallel matrices are two different classes at the moment */ 143*f17f7ee2SSatish Balay HMatrix *hmatrix; 144*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 145*f17f7ee2SSatish Balay DistributedHMatrix *dist_hmatrix; 146*f17f7ee2SSatish Balay #else 147*f17f7ee2SSatish Balay HMatrix *dist_hmatrix; /* just to not clutter the code */ 148*f17f7ee2SSatish Balay #endif 149*f17f7ee2SSatish Balay /* May use permutations */ 150*f17f7ee2SSatish Balay PetscSF sf; 151*f17f7ee2SSatish Balay PetscLayout h2opus_rmap, h2opus_cmap; 152*f17f7ee2SSatish Balay IS h2opus_indexmap; 153*f17f7ee2SSatish Balay thrust::host_vector<PetscScalar> *xx,*yy; 154*f17f7ee2SSatish Balay PetscInt xxs,yys; 155*f17f7ee2SSatish Balay PetscBool multsetup; 156*f17f7ee2SSatish Balay 157*f17f7ee2SSatish Balay /* GPU */ 158*f17f7ee2SSatish Balay HMatrix_GPU *hmatrix_gpu; 159*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 160*f17f7ee2SSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu; 161*f17f7ee2SSatish Balay #else 162*f17f7ee2SSatish Balay HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 163*f17f7ee2SSatish Balay #endif 164*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 165*f17f7ee2SSatish Balay thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; 166*f17f7ee2SSatish Balay PetscInt xxs_gpu,yys_gpu; 167*f17f7ee2SSatish Balay #endif 168*f17f7ee2SSatish Balay 169*f17f7ee2SSatish Balay /* construction from matvecs */ 170*f17f7ee2SSatish Balay PetscMatrixSampler* sampler; 171*f17f7ee2SSatish Balay PetscBool nativemult; 172*f17f7ee2SSatish Balay 173*f17f7ee2SSatish Balay /* Admissibility */ 174*f17f7ee2SSatish Balay PetscReal eta; 175*f17f7ee2SSatish Balay PetscInt leafsize; 176*f17f7ee2SSatish Balay 177*f17f7ee2SSatish Balay /* for dof reordering */ 178*f17f7ee2SSatish Balay PetscPointCloud<PetscReal> *ptcloud; 179*f17f7ee2SSatish Balay 180*f17f7ee2SSatish Balay /* kernel for generating matrix entries */ 181*f17f7ee2SSatish Balay PetscFunctionGenerator<PetscScalar> *kernel; 182*f17f7ee2SSatish Balay 183*f17f7ee2SSatish Balay /* basis orthogonalized? */ 184*f17f7ee2SSatish Balay PetscBool orthogonal; 185*f17f7ee2SSatish Balay 186*f17f7ee2SSatish Balay /* customization */ 187*f17f7ee2SSatish Balay PetscInt basisord; 188*f17f7ee2SSatish Balay PetscInt max_rank; 189*f17f7ee2SSatish Balay PetscInt bs; 190*f17f7ee2SSatish Balay PetscReal rtol; 191*f17f7ee2SSatish Balay PetscInt norm_max_samples; 192*f17f7ee2SSatish Balay PetscBool check_construction; 193*f17f7ee2SSatish Balay PetscBool hara_verbose; 194*f17f7ee2SSatish Balay 195*f17f7ee2SSatish Balay /* keeps track of MatScale values */ 196*f17f7ee2SSatish Balay PetscScalar s; 197*f17f7ee2SSatish Balay } Mat_H2OPUS; 198*f17f7ee2SSatish Balay 199*f17f7ee2SSatish Balay static PetscErrorCode MatDestroy_H2OPUS(Mat A) 200*f17f7ee2SSatish Balay { 201*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 202*f17f7ee2SSatish Balay 203*f17f7ee2SSatish Balay PetscFunctionBegin; 204*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 205*f17f7ee2SSatish Balay h2opusDestroyDistributedHandle(a->handle); 206*f17f7ee2SSatish Balay #else 207*f17f7ee2SSatish Balay h2opusDestroyHandle(a->handle); 208*f17f7ee2SSatish Balay #endif 209*f17f7ee2SSatish Balay delete a->dist_hmatrix; 210*f17f7ee2SSatish Balay delete a->hmatrix; 211*f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&a->sf)); 212*f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 213*f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 214*f17f7ee2SSatish Balay PetscCall(ISDestroy(&a->h2opus_indexmap)); 215*f17f7ee2SSatish Balay delete a->xx; 216*f17f7ee2SSatish Balay delete a->yy; 217*f17f7ee2SSatish Balay delete a->hmatrix_gpu; 218*f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 219*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 220*f17f7ee2SSatish Balay delete a->xx_gpu; 221*f17f7ee2SSatish Balay delete a->yy_gpu; 222*f17f7ee2SSatish Balay #endif 223*f17f7ee2SSatish Balay delete a->sampler; 224*f17f7ee2SSatish Balay delete a->ptcloud; 225*f17f7ee2SSatish Balay delete a->kernel; 226*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL)); 227*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL)); 228*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL)); 229*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL)); 230*f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 231*f17f7ee2SSatish Balay PetscCall(PetscFree(A->data)); 232*f17f7ee2SSatish Balay PetscFunctionReturn(0); 233*f17f7ee2SSatish Balay } 234*f17f7ee2SSatish Balay 235*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) 236*f17f7ee2SSatish Balay { 237*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 238*f17f7ee2SSatish Balay PetscBool ish2opus; 239*f17f7ee2SSatish Balay 240*f17f7ee2SSatish Balay PetscFunctionBegin; 241*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 242*f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A,nm,2); 243*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 244*f17f7ee2SSatish Balay if (ish2opus) { 245*f17f7ee2SSatish Balay if (a->h2opus_rmap) { /* need to swap layouts for vector creation */ 246*f17f7ee2SSatish Balay if ((!a->nativemult && nm) || (a->nativemult && !nm)) { 247*f17f7ee2SSatish Balay PetscLayout t; 248*f17f7ee2SSatish Balay t = A->rmap; 249*f17f7ee2SSatish Balay A->rmap = a->h2opus_rmap; 250*f17f7ee2SSatish Balay a->h2opus_rmap = t; 251*f17f7ee2SSatish Balay t = A->cmap; 252*f17f7ee2SSatish Balay A->cmap = a->h2opus_cmap; 253*f17f7ee2SSatish Balay a->h2opus_cmap = t; 254*f17f7ee2SSatish Balay } 255*f17f7ee2SSatish Balay } 256*f17f7ee2SSatish Balay a->nativemult = nm; 257*f17f7ee2SSatish Balay } 258*f17f7ee2SSatish Balay PetscFunctionReturn(0); 259*f17f7ee2SSatish Balay } 260*f17f7ee2SSatish Balay 261*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) 262*f17f7ee2SSatish Balay { 263*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 264*f17f7ee2SSatish Balay PetscBool ish2opus; 265*f17f7ee2SSatish Balay 266*f17f7ee2SSatish Balay PetscFunctionBegin; 267*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 268*f17f7ee2SSatish Balay PetscValidPointer(nm,2); 269*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 270*f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 271*f17f7ee2SSatish Balay *nm = a->nativemult; 272*f17f7ee2SSatish Balay PetscFunctionReturn(0); 273*f17f7ee2SSatish Balay } 274*f17f7ee2SSatish Balay 275*f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n) 276*f17f7ee2SSatish Balay { 277*f17f7ee2SSatish Balay PetscBool ish2opus; 278*f17f7ee2SSatish Balay PetscInt nmax = PETSC_DECIDE; 279*f17f7ee2SSatish Balay Mat_H2OPUS *a = NULL; 280*f17f7ee2SSatish Balay PetscBool mult = PETSC_FALSE; 281*f17f7ee2SSatish Balay 282*f17f7ee2SSatish Balay PetscFunctionBegin; 283*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 284*f17f7ee2SSatish Balay if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */ 285*f17f7ee2SSatish Balay a = (Mat_H2OPUS*)A->data; 286*f17f7ee2SSatish Balay 287*f17f7ee2SSatish Balay nmax = a->norm_max_samples; 288*f17f7ee2SSatish Balay mult = a->nativemult; 289*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE)); 290*f17f7ee2SSatish Balay } else { 291*f17f7ee2SSatish Balay PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL)); 292*f17f7ee2SSatish Balay } 293*f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n)); 294*f17f7ee2SSatish Balay if (a) PetscCall(MatH2OpusSetNativeMult(A,mult)); 295*f17f7ee2SSatish Balay PetscFunctionReturn(0); 296*f17f7ee2SSatish Balay } 297*f17f7ee2SSatish Balay 298*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) 299*f17f7ee2SSatish Balay { 300*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 301*f17f7ee2SSatish Balay PetscInt n; 302*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 303*f17f7ee2SSatish Balay 304*f17f7ee2SSatish Balay PetscFunctionBegin; 305*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 306*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 307*f17f7ee2SSatish Balay #endif 308*f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 309*f17f7ee2SSatish Balay if (boundtocpu) { 310*f17f7ee2SSatish Balay if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; } 311*f17f7ee2SSatish Balay if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; } 312*f17f7ee2SSatish Balay } 313*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 314*f17f7ee2SSatish Balay if (!boundtocpu) { 315*f17f7ee2SSatish Balay if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; } 316*f17f7ee2SSatish Balay if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; } 317*f17f7ee2SSatish Balay } 318*f17f7ee2SSatish Balay #endif 319*f17f7ee2SSatish Balay PetscFunctionReturn(0); 320*f17f7ee2SSatish Balay } 321*f17f7ee2SSatish Balay 322*f17f7ee2SSatish Balay static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) 323*f17f7ee2SSatish Balay { 324*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 325*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 326*f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 327*f17f7ee2SSatish Balay #else 328*f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 329*f17f7ee2SSatish Balay #endif 330*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 331*f17f7ee2SSatish Balay PetscScalar *xx,*yy,*uxx,*uyy; 332*f17f7ee2SSatish Balay PetscInt blda,clda; 333*f17f7ee2SSatish Balay PetscMPIInt size; 334*f17f7ee2SSatish Balay PetscSF bsf,csf; 335*f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 336*f17f7ee2SSatish Balay 337*f17f7ee2SSatish Balay PetscFunctionBegin; 338*f17f7ee2SSatish Balay HLibProfile::clear(); 339*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 340*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 341*f17f7ee2SSatish Balay #endif 342*f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(B,&blda)); 343*f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(C,&clda)); 344*f17f7ee2SSatish Balay if (usesf) { 345*f17f7ee2SSatish Balay PetscInt n; 346*f17f7ee2SSatish Balay 347*f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf)); 348*f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf)); 349*f17f7ee2SSatish Balay 350*f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N)); 351*f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 352*f17f7ee2SSatish Balay blda = n; 353*f17f7ee2SSatish Balay clda = n; 354*f17f7ee2SSatish Balay } 355*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 356*f17f7ee2SSatish Balay if (boundtocpu) { 357*f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx)); 358*f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayWrite(C,&yy)); 359*f17f7ee2SSatish Balay if (usesf) { 360*f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 361*f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 362*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 363*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 364*f17f7ee2SSatish Balay } else { 365*f17f7ee2SSatish Balay uxx = xx; 366*f17f7ee2SSatish Balay uyy = yy; 367*f17f7ee2SSatish Balay } 368*f17f7ee2SSatish Balay if (size > 1) { 369*f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 370*f17f7ee2SSatish Balay PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 371*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 372*f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 373*f17f7ee2SSatish Balay #endif 374*f17f7ee2SSatish Balay } else { 375*f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 376*f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 377*f17f7ee2SSatish Balay } 378*f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx)); 379*f17f7ee2SSatish Balay if (usesf) { 380*f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 381*f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 382*f17f7ee2SSatish Balay } 383*f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayWrite(C,&yy)); 384*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 385*f17f7ee2SSatish Balay } else { 386*f17f7ee2SSatish Balay PetscBool ciscuda,biscuda; 387*f17f7ee2SSatish Balay 388*f17f7ee2SSatish Balay /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */ 389*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 390*f17f7ee2SSatish Balay if (!biscuda) { 391*f17f7ee2SSatish Balay PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B)); 392*f17f7ee2SSatish Balay } 393*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 394*f17f7ee2SSatish Balay if (!ciscuda) { 395*f17f7ee2SSatish Balay C->assembled = PETSC_TRUE; 396*f17f7ee2SSatish Balay PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C)); 397*f17f7ee2SSatish Balay } 398*f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx)); 399*f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayWrite(C,&yy)); 400*f17f7ee2SSatish Balay if (usesf) { 401*f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 402*f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 403*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 404*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 405*f17f7ee2SSatish Balay } else { 406*f17f7ee2SSatish Balay uxx = xx; 407*f17f7ee2SSatish Balay uyy = yy; 408*f17f7ee2SSatish Balay } 409*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 410*f17f7ee2SSatish Balay if (size > 1) { 411*f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix"); 412*f17f7ee2SSatish Balay PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 413*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 414*f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 415*f17f7ee2SSatish Balay #endif 416*f17f7ee2SSatish Balay } else { 417*f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 418*f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 419*f17f7ee2SSatish Balay } 420*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 421*f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx)); 422*f17f7ee2SSatish Balay if (usesf) { 423*f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 424*f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 425*f17f7ee2SSatish Balay } 426*f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy)); 427*f17f7ee2SSatish Balay if (!biscuda) { 428*f17f7ee2SSatish Balay PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B)); 429*f17f7ee2SSatish Balay } 430*f17f7ee2SSatish Balay if (!ciscuda) { 431*f17f7ee2SSatish Balay PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C)); 432*f17f7ee2SSatish Balay } 433*f17f7ee2SSatish Balay #endif 434*f17f7ee2SSatish Balay } 435*f17f7ee2SSatish Balay { /* log flops */ 436*f17f7ee2SSatish Balay double gops,time,perf,dev; 437*f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops,time,perf,dev); 438*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 439*f17f7ee2SSatish Balay if (boundtocpu) { 440*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 441*f17f7ee2SSatish Balay } else { 442*f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 443*f17f7ee2SSatish Balay } 444*f17f7ee2SSatish Balay #else 445*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 446*f17f7ee2SSatish Balay #endif 447*f17f7ee2SSatish Balay } 448*f17f7ee2SSatish Balay PetscFunctionReturn(0); 449*f17f7ee2SSatish Balay } 450*f17f7ee2SSatish Balay 451*f17f7ee2SSatish Balay static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) 452*f17f7ee2SSatish Balay { 453*f17f7ee2SSatish Balay Mat_Product *product = C->product; 454*f17f7ee2SSatish Balay 455*f17f7ee2SSatish Balay PetscFunctionBegin; 456*f17f7ee2SSatish Balay MatCheckProduct(C,1); 457*f17f7ee2SSatish Balay switch (product->type) { 458*f17f7ee2SSatish Balay case MATPRODUCT_AB: 459*f17f7ee2SSatish Balay PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C)); 460*f17f7ee2SSatish Balay break; 461*f17f7ee2SSatish Balay case MATPRODUCT_AtB: 462*f17f7ee2SSatish Balay PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C)); 463*f17f7ee2SSatish Balay break; 464*f17f7ee2SSatish Balay default: 465*f17f7ee2SSatish Balay SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]); 466*f17f7ee2SSatish Balay } 467*f17f7ee2SSatish Balay PetscFunctionReturn(0); 468*f17f7ee2SSatish Balay } 469*f17f7ee2SSatish Balay 470*f17f7ee2SSatish Balay static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) 471*f17f7ee2SSatish Balay { 472*f17f7ee2SSatish Balay Mat_Product *product = C->product; 473*f17f7ee2SSatish Balay PetscBool cisdense; 474*f17f7ee2SSatish Balay Mat A,B; 475*f17f7ee2SSatish Balay 476*f17f7ee2SSatish Balay PetscFunctionBegin; 477*f17f7ee2SSatish Balay MatCheckProduct(C,1); 478*f17f7ee2SSatish Balay A = product->A; 479*f17f7ee2SSatish Balay B = product->B; 480*f17f7ee2SSatish Balay switch (product->type) { 481*f17f7ee2SSatish Balay case MATPRODUCT_AB: 482*f17f7ee2SSatish Balay PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N)); 483*f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B)); 484*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 485*f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name)); 486*f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 487*f17f7ee2SSatish Balay break; 488*f17f7ee2SSatish Balay case MATPRODUCT_AtB: 489*f17f7ee2SSatish Balay PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N)); 490*f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B)); 491*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 492*f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name)); 493*f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 494*f17f7ee2SSatish Balay break; 495*f17f7ee2SSatish Balay default: 496*f17f7ee2SSatish Balay SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]); 497*f17f7ee2SSatish Balay } 498*f17f7ee2SSatish Balay C->ops->productsymbolic = NULL; 499*f17f7ee2SSatish Balay C->ops->productnumeric = MatProductNumeric_H2OPUS; 500*f17f7ee2SSatish Balay PetscFunctionReturn(0); 501*f17f7ee2SSatish Balay } 502*f17f7ee2SSatish Balay 503*f17f7ee2SSatish Balay static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) 504*f17f7ee2SSatish Balay { 505*f17f7ee2SSatish Balay PetscFunctionBegin; 506*f17f7ee2SSatish Balay MatCheckProduct(C,1); 507*f17f7ee2SSatish Balay if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) { 508*f17f7ee2SSatish Balay C->ops->productsymbolic = MatProductSymbolic_H2OPUS; 509*f17f7ee2SSatish Balay } 510*f17f7ee2SSatish Balay PetscFunctionReturn(0); 511*f17f7ee2SSatish Balay } 512*f17f7ee2SSatish Balay 513*f17f7ee2SSatish Balay static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) 514*f17f7ee2SSatish Balay { 515*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 516*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 517*f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 518*f17f7ee2SSatish Balay #else 519*f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 520*f17f7ee2SSatish Balay #endif 521*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 522*f17f7ee2SSatish Balay PetscInt n; 523*f17f7ee2SSatish Balay PetscScalar *xx,*yy,*uxx,*uyy; 524*f17f7ee2SSatish Balay PetscMPIInt size; 525*f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 526*f17f7ee2SSatish Balay 527*f17f7ee2SSatish Balay PetscFunctionBegin; 528*f17f7ee2SSatish Balay HLibProfile::clear(); 529*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 530*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 531*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 532*f17f7ee2SSatish Balay #endif 533*f17f7ee2SSatish Balay if (usesf) { 534*f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 535*f17f7ee2SSatish Balay } else n = A->rmap->n; 536*f17f7ee2SSatish Balay if (boundtocpu) { 537*f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx)); 538*f17f7ee2SSatish Balay if (sy == 0.0) { 539*f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(y,&yy)); 540*f17f7ee2SSatish Balay } else { 541*f17f7ee2SSatish Balay PetscCall(VecGetArray(y,&yy)); 542*f17f7ee2SSatish Balay } 543*f17f7ee2SSatish Balay if (usesf) { 544*f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 545*f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 546*f17f7ee2SSatish Balay 547*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 548*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 549*f17f7ee2SSatish Balay if (sy != 0.0) { 550*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 551*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 552*f17f7ee2SSatish Balay } 553*f17f7ee2SSatish Balay } else { 554*f17f7ee2SSatish Balay uxx = xx; 555*f17f7ee2SSatish Balay uyy = yy; 556*f17f7ee2SSatish Balay } 557*f17f7ee2SSatish Balay if (size > 1) { 558*f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 559*f17f7ee2SSatish Balay PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 560*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 561*f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle); 562*f17f7ee2SSatish Balay #endif 563*f17f7ee2SSatish Balay } else { 564*f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 565*f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle); 566*f17f7ee2SSatish Balay } 567*f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx)); 568*f17f7ee2SSatish Balay if (usesf) { 569*f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 570*f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 571*f17f7ee2SSatish Balay } 572*f17f7ee2SSatish Balay if (sy == 0.0) { 573*f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(y,&yy)); 574*f17f7ee2SSatish Balay } else { 575*f17f7ee2SSatish Balay PetscCall(VecRestoreArray(y,&yy)); 576*f17f7ee2SSatish Balay } 577*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 578*f17f7ee2SSatish Balay } else { 579*f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx)); 580*f17f7ee2SSatish Balay if (sy == 0.0) { 581*f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayWrite(y,&yy)); 582*f17f7ee2SSatish Balay } else { 583*f17f7ee2SSatish Balay PetscCall(VecCUDAGetArray(y,&yy)); 584*f17f7ee2SSatish Balay } 585*f17f7ee2SSatish Balay if (usesf) { 586*f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 587*f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 588*f17f7ee2SSatish Balay 589*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 590*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 591*f17f7ee2SSatish Balay if (sy != 0.0) { 592*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 593*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 594*f17f7ee2SSatish Balay } 595*f17f7ee2SSatish Balay } else { 596*f17f7ee2SSatish Balay uxx = xx; 597*f17f7ee2SSatish Balay uyy = yy; 598*f17f7ee2SSatish Balay } 599*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 600*f17f7ee2SSatish Balay if (size > 1) { 601*f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix"); 602*f17f7ee2SSatish Balay PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 603*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 604*f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle); 605*f17f7ee2SSatish Balay #endif 606*f17f7ee2SSatish Balay } else { 607*f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 608*f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle); 609*f17f7ee2SSatish Balay } 610*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 611*f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx)); 612*f17f7ee2SSatish Balay if (usesf) { 613*f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 614*f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 615*f17f7ee2SSatish Balay } 616*f17f7ee2SSatish Balay if (sy == 0.0) { 617*f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 618*f17f7ee2SSatish Balay } else { 619*f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArray(y,&yy)); 620*f17f7ee2SSatish Balay } 621*f17f7ee2SSatish Balay #endif 622*f17f7ee2SSatish Balay } 623*f17f7ee2SSatish Balay { /* log flops */ 624*f17f7ee2SSatish Balay double gops,time,perf,dev; 625*f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops,time,perf,dev); 626*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 627*f17f7ee2SSatish Balay if (boundtocpu) { 628*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 629*f17f7ee2SSatish Balay } else { 630*f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 631*f17f7ee2SSatish Balay } 632*f17f7ee2SSatish Balay #else 633*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 634*f17f7ee2SSatish Balay #endif 635*f17f7ee2SSatish Balay } 636*f17f7ee2SSatish Balay PetscFunctionReturn(0); 637*f17f7ee2SSatish Balay } 638*f17f7ee2SSatish Balay 639*f17f7ee2SSatish Balay static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) 640*f17f7ee2SSatish Balay { 641*f17f7ee2SSatish Balay PetscBool xiscuda,yiscuda; 642*f17f7ee2SSatish Balay 643*f17f7ee2SSatish Balay PetscFunctionBegin; 644*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 645*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"")); 646*f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda)); 647*f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE)); 648*f17f7ee2SSatish Balay PetscFunctionReturn(0); 649*f17f7ee2SSatish Balay } 650*f17f7ee2SSatish Balay 651*f17f7ee2SSatish Balay static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) 652*f17f7ee2SSatish Balay { 653*f17f7ee2SSatish Balay PetscBool xiscuda,yiscuda; 654*f17f7ee2SSatish Balay 655*f17f7ee2SSatish Balay PetscFunctionBegin; 656*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 657*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"")); 658*f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda)); 659*f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE)); 660*f17f7ee2SSatish Balay PetscFunctionReturn(0); 661*f17f7ee2SSatish Balay } 662*f17f7ee2SSatish Balay 663*f17f7ee2SSatish Balay static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 664*f17f7ee2SSatish Balay { 665*f17f7ee2SSatish Balay PetscBool xiscuda,ziscuda; 666*f17f7ee2SSatish Balay 667*f17f7ee2SSatish Balay PetscFunctionBegin; 668*f17f7ee2SSatish Balay PetscCall(VecCopy(y,z)); 669*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 670*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 671*f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 672*f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE)); 673*f17f7ee2SSatish Balay PetscFunctionReturn(0); 674*f17f7ee2SSatish Balay } 675*f17f7ee2SSatish Balay 676*f17f7ee2SSatish Balay static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 677*f17f7ee2SSatish Balay { 678*f17f7ee2SSatish Balay PetscBool xiscuda,ziscuda; 679*f17f7ee2SSatish Balay 680*f17f7ee2SSatish Balay PetscFunctionBegin; 681*f17f7ee2SSatish Balay PetscCall(VecCopy(y,z)); 682*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 683*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 684*f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 685*f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE)); 686*f17f7ee2SSatish Balay PetscFunctionReturn(0); 687*f17f7ee2SSatish Balay } 688*f17f7ee2SSatish Balay 689*f17f7ee2SSatish Balay static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 690*f17f7ee2SSatish Balay { 691*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 692*f17f7ee2SSatish Balay 693*f17f7ee2SSatish Balay PetscFunctionBegin; 694*f17f7ee2SSatish Balay a->s *= s; 695*f17f7ee2SSatish Balay PetscFunctionReturn(0); 696*f17f7ee2SSatish Balay } 697*f17f7ee2SSatish Balay 698*f17f7ee2SSatish Balay static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A) 699*f17f7ee2SSatish Balay { 700*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 701*f17f7ee2SSatish Balay 702*f17f7ee2SSatish Balay PetscFunctionBegin; 703*f17f7ee2SSatish Balay PetscCall(PetscOptionsHead(PetscOptionsObject,"H2OPUS options")); 704*f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL)); 705*f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL)); 706*f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL)); 707*f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL)); 708*f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL)); 709*f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL)); 710*f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL)); 711*f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL)); 712*f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL)); 713*f17f7ee2SSatish Balay PetscCall(PetscOptionsTail()); 714*f17f7ee2SSatish Balay PetscFunctionReturn(0); 715*f17f7ee2SSatish Balay } 716*f17f7ee2SSatish Balay 717*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*); 718*f17f7ee2SSatish Balay 719*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 720*f17f7ee2SSatish Balay { 721*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 722*f17f7ee2SSatish Balay Vec c; 723*f17f7ee2SSatish Balay PetscInt spacedim; 724*f17f7ee2SSatish Balay const PetscScalar *coords; 725*f17f7ee2SSatish Balay 726*f17f7ee2SSatish Balay PetscFunctionBegin; 727*f17f7ee2SSatish Balay if (a->ptcloud) PetscFunctionReturn(0); 728*f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c)); 729*f17f7ee2SSatish Balay if (!c && a->sampler) { 730*f17f7ee2SSatish Balay Mat S = a->sampler->GetSamplingMat(); 731*f17f7ee2SSatish Balay 732*f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c)); 733*f17f7ee2SSatish Balay } 734*f17f7ee2SSatish Balay if (!c) { 735*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL)); 736*f17f7ee2SSatish Balay } else { 737*f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(c,&coords)); 738*f17f7ee2SSatish Balay PetscCall(VecGetBlockSize(c,&spacedim)); 739*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL)); 740*f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(c,&coords)); 741*f17f7ee2SSatish Balay } 742*f17f7ee2SSatish Balay PetscFunctionReturn(0); 743*f17f7ee2SSatish Balay } 744*f17f7ee2SSatish Balay 745*f17f7ee2SSatish Balay static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 746*f17f7ee2SSatish Balay { 747*f17f7ee2SSatish Balay MPI_Comm comm; 748*f17f7ee2SSatish Balay PetscMPIInt size; 749*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 750*f17f7ee2SSatish Balay PetscInt n = 0,*idx = NULL; 751*f17f7ee2SSatish Balay int *iidx = NULL; 752*f17f7ee2SSatish Balay PetscCopyMode own; 753*f17f7ee2SSatish Balay PetscBool rid; 754*f17f7ee2SSatish Balay 755*f17f7ee2SSatish Balay PetscFunctionBegin; 756*f17f7ee2SSatish Balay if (a->multsetup) PetscFunctionReturn(0); 757*f17f7ee2SSatish Balay if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 758*f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 759*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 760*f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 761*f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 762*f17f7ee2SSatish Balay a->xxs_gpu = 1; 763*f17f7ee2SSatish Balay a->yys_gpu = 1; 764*f17f7ee2SSatish Balay #endif 765*f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 766*f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 767*f17f7ee2SSatish Balay a->xxs = 1; 768*f17f7ee2SSatish Balay a->yys = 1; 769*f17f7ee2SSatish Balay } else { 770*f17f7ee2SSatish Balay IS is; 771*f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 772*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 773*f17f7ee2SSatish Balay if (!a->h2opus_indexmap) { 774*f17f7ee2SSatish Balay if (size > 1) { 775*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 776*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 777*f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 778*f17f7ee2SSatish Balay n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 779*f17f7ee2SSatish Balay #endif 780*f17f7ee2SSatish Balay } else { 781*f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 782*f17f7ee2SSatish Balay n = a->hmatrix->u_basis_tree.index_map.size(); 783*f17f7ee2SSatish Balay } 784*f17f7ee2SSatish Balay 785*f17f7ee2SSatish Balay if (PetscDefined(USE_64BIT_INDICES)) { 786*f17f7ee2SSatish Balay PetscInt i; 787*f17f7ee2SSatish Balay 788*f17f7ee2SSatish Balay own = PETSC_OWN_POINTER; 789*f17f7ee2SSatish Balay PetscCall(PetscMalloc1(n,&idx)); 790*f17f7ee2SSatish Balay for (i=0;i<n;i++) idx[i] = iidx[i]; 791*f17f7ee2SSatish Balay } else { 792*f17f7ee2SSatish Balay own = PETSC_COPY_VALUES; 793*f17f7ee2SSatish Balay idx = (PetscInt*)iidx; 794*f17f7ee2SSatish Balay } 795*f17f7ee2SSatish Balay PetscCall(ISCreateGeneral(comm,n,idx,own,&is)); 796*f17f7ee2SSatish Balay PetscCall(ISSetPermutation(is)); 797*f17f7ee2SSatish Balay PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view")); 798*f17f7ee2SSatish Balay a->h2opus_indexmap = is; 799*f17f7ee2SSatish Balay } 800*f17f7ee2SSatish Balay PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n)); 801*f17f7ee2SSatish Balay PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 802*f17f7ee2SSatish Balay rid = (PetscBool)(n == A->rmap->n); 803*f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm)); 804*f17f7ee2SSatish Balay if (rid) { 805*f17f7ee2SSatish Balay PetscCall(ISIdentity(a->h2opus_indexmap,&rid)); 806*f17f7ee2SSatish Balay } 807*f17f7ee2SSatish Balay if (!rid) { 808*f17f7ee2SSatish Balay if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 809*f17f7ee2SSatish Balay PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap)); 810*f17f7ee2SSatish Balay PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n)); 811*f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 812*f17f7ee2SSatish Balay PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap)); 813*f17f7ee2SSatish Balay } 814*f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm,&a->sf)); 815*f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx)); 816*f17f7ee2SSatish Balay PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view")); 817*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 818*f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 819*f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 820*f17f7ee2SSatish Balay a->xxs_gpu = 1; 821*f17f7ee2SSatish Balay a->yys_gpu = 1; 822*f17f7ee2SSatish Balay #endif 823*f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 824*f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 825*f17f7ee2SSatish Balay a->xxs = 1; 826*f17f7ee2SSatish Balay a->yys = 1; 827*f17f7ee2SSatish Balay } 828*f17f7ee2SSatish Balay PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 829*f17f7ee2SSatish Balay } 830*f17f7ee2SSatish Balay a->multsetup = PETSC_TRUE; 831*f17f7ee2SSatish Balay PetscFunctionReturn(0); 832*f17f7ee2SSatish Balay } 833*f17f7ee2SSatish Balay 834*f17f7ee2SSatish Balay static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) 835*f17f7ee2SSatish Balay { 836*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 837*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 838*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 839*f17f7ee2SSatish Balay #else 840*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 841*f17f7ee2SSatish Balay #endif 842*f17f7ee2SSatish Balay PetscBool kernel = PETSC_FALSE; 843*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 844*f17f7ee2SSatish Balay PetscBool samplingdone = PETSC_FALSE; 845*f17f7ee2SSatish Balay MPI_Comm comm; 846*f17f7ee2SSatish Balay PetscMPIInt size; 847*f17f7ee2SSatish Balay 848*f17f7ee2SSatish Balay PetscFunctionBegin; 849*f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 850*f17f7ee2SSatish Balay PetscCheckFalse(A->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 851*f17f7ee2SSatish Balay PetscCheckFalse(A->rmap->N != A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 852*f17f7ee2SSatish Balay 853*f17f7ee2SSatish Balay /* XXX */ 854*f17f7ee2SSatish Balay a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N)); 855*f17f7ee2SSatish Balay 856*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 857*f17f7ee2SSatish Balay /* TODO REUSABILITY of geometric construction */ 858*f17f7ee2SSatish Balay delete a->hmatrix; 859*f17f7ee2SSatish Balay delete a->dist_hmatrix; 860*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 861*f17f7ee2SSatish Balay delete a->hmatrix_gpu; 862*f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 863*f17f7ee2SSatish Balay #endif 864*f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 865*f17f7ee2SSatish Balay 866*f17f7ee2SSatish Balay /* TODO: other? */ 867*f17f7ee2SSatish Balay H2OpusBoxCenterAdmissibility adm(a->eta); 868*f17f7ee2SSatish Balay 869*f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0)); 870*f17f7ee2SSatish Balay if (size > 1) { 871*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 872*f17f7ee2SSatish Balay a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */); 873*f17f7ee2SSatish Balay #else 874*f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 875*f17f7ee2SSatish Balay #endif 876*f17f7ee2SSatish Balay } else { 877*f17f7ee2SSatish Balay a->hmatrix = new HMatrix(A->rmap->n,A->symmetric); 878*f17f7ee2SSatish Balay } 879*f17f7ee2SSatish Balay PetscCall(MatH2OpusInferCoordinates_Private(A)); 880*f17f7ee2SSatish Balay PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud"); 881*f17f7ee2SSatish Balay if (a->kernel) { 882*f17f7ee2SSatish Balay BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 883*f17f7ee2SSatish Balay if (size > 1) { 884*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 885*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 886*f17f7ee2SSatish Balay buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle); 887*f17f7ee2SSatish Balay #endif 888*f17f7ee2SSatish Balay } else { 889*f17f7ee2SSatish Balay buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord); 890*f17f7ee2SSatish Balay } 891*f17f7ee2SSatish Balay kernel = PETSC_TRUE; 892*f17f7ee2SSatish Balay } else { 893*f17f7ee2SSatish Balay PetscCheckFalse(size > 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel"); 894*f17f7ee2SSatish Balay buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm); 895*f17f7ee2SSatish Balay } 896*f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 897*f17f7ee2SSatish Balay 898*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 899*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 900*f17f7ee2SSatish Balay if (!boundtocpu) { 901*f17f7ee2SSatish Balay if (size > 1) { 902*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 903*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 904*f17f7ee2SSatish Balay a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 905*f17f7ee2SSatish Balay #endif 906*f17f7ee2SSatish Balay } else { 907*f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 908*f17f7ee2SSatish Balay } 909*f17f7ee2SSatish Balay } 910*f17f7ee2SSatish Balay #endif 911*f17f7ee2SSatish Balay if (size == 1) { 912*f17f7ee2SSatish Balay if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 913*f17f7ee2SSatish Balay PetscReal Anorm; 914*f17f7ee2SSatish Balay bool verbose; 915*f17f7ee2SSatish Balay 916*f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL)); 917*f17f7ee2SSatish Balay verbose = a->hara_verbose; 918*f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm)); 919*f17f7ee2SSatish Balay if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs)); 920*f17f7ee2SSatish Balay if (a->sf && !a->nativemult) { 921*f17f7ee2SSatish Balay a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data()); 922*f17f7ee2SSatish Balay } 923*f17f7ee2SSatish Balay a->sampler->SetStream(handle->getMainStream()); 924*f17f7ee2SSatish Balay if (boundtocpu) { 925*f17f7ee2SSatish Balay a->sampler->SetGPUSampling(false); 926*f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 927*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 928*f17f7ee2SSatish Balay } else { 929*f17f7ee2SSatish Balay a->sampler->SetGPUSampling(true); 930*f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 931*f17f7ee2SSatish Balay #endif 932*f17f7ee2SSatish Balay } 933*f17f7ee2SSatish Balay samplingdone = PETSC_TRUE; 934*f17f7ee2SSatish Balay } 935*f17f7ee2SSatish Balay } 936*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 937*f17f7ee2SSatish Balay if (!boundtocpu) { 938*f17f7ee2SSatish Balay delete a->hmatrix; 939*f17f7ee2SSatish Balay delete a->dist_hmatrix; 940*f17f7ee2SSatish Balay a->hmatrix = NULL; 941*f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 942*f17f7ee2SSatish Balay } 943*f17f7ee2SSatish Balay A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 944*f17f7ee2SSatish Balay #endif 945*f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0)); 946*f17f7ee2SSatish Balay 947*f17f7ee2SSatish Balay if (!a->s) a->s = 1.0; 948*f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 949*f17f7ee2SSatish Balay 950*f17f7ee2SSatish Balay if (samplingdone) { 951*f17f7ee2SSatish Balay PetscBool check = a->check_construction; 952*f17f7ee2SSatish Balay PetscBool checke = PETSC_FALSE; 953*f17f7ee2SSatish Balay 954*f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL)); 955*f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL)); 956*f17f7ee2SSatish Balay if (check) { 957*f17f7ee2SSatish Balay Mat E,Ae; 958*f17f7ee2SSatish Balay PetscReal n1,ni,n2; 959*f17f7ee2SSatish Balay PetscReal n1A,niA,n2A; 960*f17f7ee2SSatish Balay void (*normfunc)(void); 961*f17f7ee2SSatish Balay 962*f17f7ee2SSatish Balay Ae = a->sampler->GetSamplingMat(); 963*f17f7ee2SSatish Balay PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E)); 964*f17f7ee2SSatish Balay PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 965*f17f7ee2SSatish Balay PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN)); 966*f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_1,&n1)); 967*f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_INFINITY,&ni)); 968*f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_2,&n2)); 969*f17f7ee2SSatish Balay if (checke) { 970*f17f7ee2SSatish Balay Mat eA,eE,eAe; 971*f17f7ee2SSatish Balay 972*f17f7ee2SSatish Balay PetscCall(MatComputeOperator(A,MATAIJ,&eA)); 973*f17f7ee2SSatish Balay PetscCall(MatComputeOperator(E,MATAIJ,&eE)); 974*f17f7ee2SSatish Balay PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe)); 975*f17f7ee2SSatish Balay PetscCall(MatChop(eA,PETSC_SMALL)); 976*f17f7ee2SSatish Balay PetscCall(MatChop(eE,PETSC_SMALL)); 977*f17f7ee2SSatish Balay PetscCall(MatChop(eAe,PETSC_SMALL)); 978*f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat")); 979*f17f7ee2SSatish Balay PetscCall(MatView(eA,NULL)); 980*f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eAe,"S")); 981*f17f7ee2SSatish Balay PetscCall(MatView(eAe,NULL)); 982*f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S")); 983*f17f7ee2SSatish Balay PetscCall(MatView(eE,NULL)); 984*f17f7ee2SSatish Balay PetscCall(MatDestroy(&eA)); 985*f17f7ee2SSatish Balay PetscCall(MatDestroy(&eE)); 986*f17f7ee2SSatish Balay PetscCall(MatDestroy(&eAe)); 987*f17f7ee2SSatish Balay } 988*f17f7ee2SSatish Balay 989*f17f7ee2SSatish Balay PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc)); 990*f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 991*f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_1,&n1A)); 992*f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_INFINITY,&niA)); 993*f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_2,&n2A)); 994*f17f7ee2SSatish Balay n1A = PetscMax(n1A,PETSC_SMALL); 995*f17f7ee2SSatish Balay n2A = PetscMax(n2A,PETSC_SMALL); 996*f17f7ee2SSatish Balay niA = PetscMax(niA,PETSC_SMALL); 997*f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc)); 998*f17f7ee2SSatish Balay PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A))); 999*f17f7ee2SSatish Balay PetscCall(MatDestroy(&E)); 1000*f17f7ee2SSatish Balay } 1001*f17f7ee2SSatish Balay a->sampler->SetSamplingMat(NULL); 1002*f17f7ee2SSatish Balay } 1003*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1004*f17f7ee2SSatish Balay } 1005*f17f7ee2SSatish Balay 1006*f17f7ee2SSatish Balay static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1007*f17f7ee2SSatish Balay { 1008*f17f7ee2SSatish Balay PetscMPIInt size; 1009*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1010*f17f7ee2SSatish Balay 1011*f17f7ee2SSatish Balay PetscFunctionBegin; 1012*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1013*f17f7ee2SSatish Balay PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported"); 1014*f17f7ee2SSatish Balay else { 1015*f17f7ee2SSatish Balay a->hmatrix->clearData(); 1016*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1017*f17f7ee2SSatish Balay if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1018*f17f7ee2SSatish Balay #endif 1019*f17f7ee2SSatish Balay } 1020*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1021*f17f7ee2SSatish Balay } 1022*f17f7ee2SSatish Balay 1023*f17f7ee2SSatish Balay static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1024*f17f7ee2SSatish Balay { 1025*f17f7ee2SSatish Balay Mat A; 1026*f17f7ee2SSatish Balay Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data; 1027*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1028*f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1029*f17f7ee2SSatish Balay #else 1030*f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1031*f17f7ee2SSatish Balay #endif 1032*f17f7ee2SSatish Balay MPI_Comm comm; 1033*f17f7ee2SSatish Balay 1034*f17f7ee2SSatish Balay PetscFunctionBegin; 1035*f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1036*f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1037*f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1038*f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1039*f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B,A)); 1040*f17f7ee2SSatish Balay a = (Mat_H2OPUS*)A->data; 1041*f17f7ee2SSatish Balay 1042*f17f7ee2SSatish Balay a->eta = b->eta; 1043*f17f7ee2SSatish Balay a->leafsize = b->leafsize; 1044*f17f7ee2SSatish Balay a->basisord = b->basisord; 1045*f17f7ee2SSatish Balay a->max_rank = b->max_rank; 1046*f17f7ee2SSatish Balay a->bs = b->bs; 1047*f17f7ee2SSatish Balay a->rtol = b->rtol; 1048*f17f7ee2SSatish Balay a->norm_max_samples = b->norm_max_samples; 1049*f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) a->s = b->s; 1050*f17f7ee2SSatish Balay 1051*f17f7ee2SSatish Balay a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1052*f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1053*f17f7ee2SSatish Balay 1054*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1055*f17f7ee2SSatish Balay if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); } 1056*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1057*f17f7ee2SSatish Balay if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); } 1058*f17f7ee2SSatish Balay #endif 1059*f17f7ee2SSatish Balay #endif 1060*f17f7ee2SSatish Balay if (b->hmatrix) { 1061*f17f7ee2SSatish Balay a->hmatrix = new HMatrix(*b->hmatrix); 1062*f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1063*f17f7ee2SSatish Balay } 1064*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1065*f17f7ee2SSatish Balay if (b->hmatrix_gpu) { 1066*f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1067*f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1068*f17f7ee2SSatish Balay } 1069*f17f7ee2SSatish Balay #endif 1070*f17f7ee2SSatish Balay if (b->sf) { 1071*f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->sf)); 1072*f17f7ee2SSatish Balay a->sf = b->sf; 1073*f17f7ee2SSatish Balay } 1074*f17f7ee2SSatish Balay if (b->h2opus_indexmap) { 1075*f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1076*f17f7ee2SSatish Balay a->h2opus_indexmap = b->h2opus_indexmap; 1077*f17f7ee2SSatish Balay } 1078*f17f7ee2SSatish Balay 1079*f17f7ee2SSatish Balay PetscCall(MatSetUp(A)); 1080*f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 1081*f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) { 1082*f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 1083*f17f7ee2SSatish Balay a->orthogonal = b->orthogonal; 1084*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1085*f17f7ee2SSatish Balay A->offloadmask = B->offloadmask; 1086*f17f7ee2SSatish Balay #endif 1087*f17f7ee2SSatish Balay } 1088*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1089*f17f7ee2SSatish Balay iscpu = B->boundtocpu; 1090*f17f7ee2SSatish Balay #endif 1091*f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,iscpu)); 1092*f17f7ee2SSatish Balay 1093*f17f7ee2SSatish Balay *nA = A; 1094*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1095*f17f7ee2SSatish Balay } 1096*f17f7ee2SSatish Balay 1097*f17f7ee2SSatish Balay static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1098*f17f7ee2SSatish Balay { 1099*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1100*f17f7ee2SSatish Balay PetscBool isascii; 1101*f17f7ee2SSatish Balay PetscMPIInt size; 1102*f17f7ee2SSatish Balay PetscViewerFormat format; 1103*f17f7ee2SSatish Balay 1104*f17f7ee2SSatish Balay PetscFunctionBegin; 1105*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii)); 1106*f17f7ee2SSatish Balay PetscCall(PetscViewerGetFormat(view,&format)); 1107*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1108*f17f7ee2SSatish Balay if (isascii) { 1109*f17f7ee2SSatish Balay if (format == PETSC_VIEWER_ASCII_MATLAB) { 1110*f17f7ee2SSatish Balay if (size == 1) { 1111*f17f7ee2SSatish Balay FILE *fp; 1112*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIGetPointer(view,&fp)); 1113*f17f7ee2SSatish Balay dumpHMatrix(*h2opus->hmatrix,6,fp); 1114*f17f7ee2SSatish Balay } 1115*f17f7ee2SSatish Balay } else { 1116*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat")); 1117*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0)); 1118*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta)); 1119*f17f7ee2SSatish Balay if (!h2opus->kernel) { 1120*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol)); 1121*f17f7ee2SSatish Balay } else { 1122*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord)); 1123*f17f7ee2SSatish Balay } 1124*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples)); 1125*f17f7ee2SSatish Balay if (size == 1) { 1126*f17f7ee2SSatish Balay double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1127*f17f7ee2SSatish Balay double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1128*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1129*f17f7ee2SSatish Balay double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1130*f17f7ee2SSatish Balay double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1131*f17f7ee2SSatish Balay #endif 1132*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1133*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1134*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1135*f17f7ee2SSatish Balay #endif 1136*f17f7ee2SSatish Balay } else { 1137*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1138*f17f7ee2SSatish Balay double matrix_mem[4] = {0.,0.,0.,0.}; 1139*f17f7ee2SSatish Balay PetscMPIInt rsize = 4; 1140*f17f7ee2SSatish Balay #else 1141*f17f7ee2SSatish Balay double matrix_mem[2] = {0.,0.}; 1142*f17f7ee2SSatish Balay PetscMPIInt rsize = 2; 1143*f17f7ee2SSatish Balay #endif 1144*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1145*f17f7ee2SSatish Balay matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1146*f17f7ee2SSatish Balay matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1147*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1148*f17f7ee2SSatish Balay matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1149*f17f7ee2SSatish Balay matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1150*f17f7ee2SSatish Balay #endif 1151*f17f7ee2SSatish Balay #endif 1152*f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A))); 1153*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1154*f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1155*f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1156*f17f7ee2SSatish Balay #endif 1157*f17f7ee2SSatish Balay } 1158*f17f7ee2SSatish Balay } 1159*f17f7ee2SSatish Balay } 1160*f17f7ee2SSatish Balay #if 0 1161*f17f7ee2SSatish Balay if (size == 1) { 1162*f17f7ee2SSatish Balay char filename[256]; 1163*f17f7ee2SSatish Balay const char *name; 1164*f17f7ee2SSatish Balay 1165*f17f7ee2SSatish Balay PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1166*f17f7ee2SSatish Balay PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name)); 1167*f17f7ee2SSatish Balay outputEps(*h2opus->hmatrix,filename); 1168*f17f7ee2SSatish Balay } 1169*f17f7ee2SSatish Balay #endif 1170*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1171*f17f7ee2SSatish Balay } 1172*f17f7ee2SSatish Balay 1173*f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) 1174*f17f7ee2SSatish Balay { 1175*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1176*f17f7ee2SSatish Balay PetscReal *gcoords; 1177*f17f7ee2SSatish Balay PetscInt N; 1178*f17f7ee2SSatish Balay MPI_Comm comm; 1179*f17f7ee2SSatish Balay PetscMPIInt size; 1180*f17f7ee2SSatish Balay PetscBool cong; 1181*f17f7ee2SSatish Balay 1182*f17f7ee2SSatish Balay PetscFunctionBegin; 1183*f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->rmap)); 1184*f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->cmap)); 1185*f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 1186*f17f7ee2SSatish Balay PetscCall(MatHasCongruentLayouts(A,&cong)); 1187*f17f7ee2SSatish Balay PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts"); 1188*f17f7ee2SSatish Balay N = A->rmap->N; 1189*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 1190*f17f7ee2SSatish Balay if (spacedim > 0 && size > 1 && cdist) { 1191*f17f7ee2SSatish Balay PetscSF sf; 1192*f17f7ee2SSatish Balay MPI_Datatype dtype; 1193*f17f7ee2SSatish Balay 1194*f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype)); 1195*f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_commit(&dtype)); 1196*f17f7ee2SSatish Balay 1197*f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm,&sf)); 1198*f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER)); 1199*f17f7ee2SSatish Balay PetscCall(PetscMalloc1(spacedim*N,&gcoords)); 1200*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE)); 1201*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE)); 1202*f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&sf)); 1203*f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_free(&dtype)); 1204*f17f7ee2SSatish Balay } else gcoords = (PetscReal*)coords; 1205*f17f7ee2SSatish Balay 1206*f17f7ee2SSatish Balay delete h2opus->ptcloud; 1207*f17f7ee2SSatish Balay delete h2opus->kernel; 1208*f17f7ee2SSatish Balay h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords); 1209*f17f7ee2SSatish Balay if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx); 1210*f17f7ee2SSatish Balay if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1211*f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 1212*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1213*f17f7ee2SSatish Balay } 1214*f17f7ee2SSatish Balay 1215*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1216*f17f7ee2SSatish Balay static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1217*f17f7ee2SSatish Balay { 1218*f17f7ee2SSatish Balay PetscMPIInt size; 1219*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1220*f17f7ee2SSatish Balay 1221*f17f7ee2SSatish Balay PetscFunctionBegin; 1222*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1223*f17f7ee2SSatish Balay if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1224*f17f7ee2SSatish Balay if (size > 1) { 1225*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1226*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1227*f17f7ee2SSatish Balay if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1228*f17f7ee2SSatish Balay else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1229*f17f7ee2SSatish Balay #endif 1230*f17f7ee2SSatish Balay } else { 1231*f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1232*f17f7ee2SSatish Balay if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1233*f17f7ee2SSatish Balay else *a->hmatrix = *a->hmatrix_gpu; 1234*f17f7ee2SSatish Balay } 1235*f17f7ee2SSatish Balay delete a->hmatrix_gpu; 1236*f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 1237*f17f7ee2SSatish Balay a->hmatrix_gpu = NULL; 1238*f17f7ee2SSatish Balay a->dist_hmatrix_gpu = NULL; 1239*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1240*f17f7ee2SSatish Balay } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1241*f17f7ee2SSatish Balay if (size > 1) { 1242*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1243*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1244*f17f7ee2SSatish Balay if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1245*f17f7ee2SSatish Balay else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1246*f17f7ee2SSatish Balay #endif 1247*f17f7ee2SSatish Balay } else { 1248*f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1249*f17f7ee2SSatish Balay if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1250*f17f7ee2SSatish Balay else *a->hmatrix_gpu = *a->hmatrix; 1251*f17f7ee2SSatish Balay } 1252*f17f7ee2SSatish Balay delete a->hmatrix; 1253*f17f7ee2SSatish Balay delete a->dist_hmatrix; 1254*f17f7ee2SSatish Balay a->hmatrix = NULL; 1255*f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 1256*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_GPU; 1257*f17f7ee2SSatish Balay } 1258*f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1259*f17f7ee2SSatish Balay if (!flg) { 1260*f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1261*f17f7ee2SSatish Balay } else { 1262*f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype)); 1263*f17f7ee2SSatish Balay } 1264*f17f7ee2SSatish Balay A->boundtocpu = flg; 1265*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1266*f17f7ee2SSatish Balay } 1267*f17f7ee2SSatish Balay #endif 1268*f17f7ee2SSatish Balay 1269*f17f7ee2SSatish Balay /*MC 1270*f17f7ee2SSatish Balay MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1271*f17f7ee2SSatish Balay 1272*f17f7ee2SSatish Balay Options Database Keys: 1273*f17f7ee2SSatish Balay . -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions() 1274*f17f7ee2SSatish Balay 1275*f17f7ee2SSatish Balay Notes: 1276*f17f7ee2SSatish Balay H2Opus implements hierarchical matrices in the H^2 flavour. 1277*f17f7ee2SSatish Balay It supports CPU or NVIDIA GPUs. 1278*f17f7ee2SSatish Balay For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus. 1279*f17f7ee2SSatish Balay In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas. 1280*f17f7ee2SSatish Balay For details and additional references, see 1281*f17f7ee2SSatish Balay "H2Opus: A distributed-memory multi-GPU software package for non-local operators", 1282*f17f7ee2SSatish Balay available at https://arxiv.org/abs/2109.05451. 1283*f17f7ee2SSatish Balay 1284*f17f7ee2SSatish Balay Level: beginner 1285*f17f7ee2SSatish Balay 1286*f17f7ee2SSatish Balay .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat() 1287*f17f7ee2SSatish Balay M*/ 1288*f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1289*f17f7ee2SSatish Balay { 1290*f17f7ee2SSatish Balay Mat_H2OPUS *a; 1291*f17f7ee2SSatish Balay PetscMPIInt size; 1292*f17f7ee2SSatish Balay 1293*f17f7ee2SSatish Balay PetscFunctionBegin; 1294*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1295*f17f7ee2SSatish Balay PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1296*f17f7ee2SSatish Balay #endif 1297*f17f7ee2SSatish Balay PetscCall(PetscNewLog(A,&a)); 1298*f17f7ee2SSatish Balay A->data = (void*)a; 1299*f17f7ee2SSatish Balay 1300*f17f7ee2SSatish Balay a->eta = 0.9; 1301*f17f7ee2SSatish Balay a->leafsize = 32; 1302*f17f7ee2SSatish Balay a->basisord = 4; 1303*f17f7ee2SSatish Balay a->max_rank = 64; 1304*f17f7ee2SSatish Balay a->bs = 32; 1305*f17f7ee2SSatish Balay a->rtol = 1.e-4; 1306*f17f7ee2SSatish Balay a->s = 1.0; 1307*f17f7ee2SSatish Balay a->norm_max_samples = 10; 1308*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1309*f17f7ee2SSatish Balay h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A)); 1310*f17f7ee2SSatish Balay #else 1311*f17f7ee2SSatish Balay h2opusCreateHandle(&a->handle); 1312*f17f7ee2SSatish Balay #endif 1313*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1314*f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS)); 1315*f17f7ee2SSatish Balay PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps))); 1316*f17f7ee2SSatish Balay 1317*f17f7ee2SSatish Balay A->ops->destroy = MatDestroy_H2OPUS; 1318*f17f7ee2SSatish Balay A->ops->view = MatView_H2OPUS; 1319*f17f7ee2SSatish Balay A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1320*f17f7ee2SSatish Balay A->ops->mult = MatMult_H2OPUS; 1321*f17f7ee2SSatish Balay A->ops->multtranspose = MatMultTranspose_H2OPUS; 1322*f17f7ee2SSatish Balay A->ops->multadd = MatMultAdd_H2OPUS; 1323*f17f7ee2SSatish Balay A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1324*f17f7ee2SSatish Balay A->ops->scale = MatScale_H2OPUS; 1325*f17f7ee2SSatish Balay A->ops->duplicate = MatDuplicate_H2OPUS; 1326*f17f7ee2SSatish Balay A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1327*f17f7ee2SSatish Balay A->ops->norm = MatNorm_H2OPUS; 1328*f17f7ee2SSatish Balay A->ops->zeroentries = MatZeroEntries_H2OPUS; 1329*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1330*f17f7ee2SSatish Balay A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1331*f17f7ee2SSatish Balay #endif 1332*f17f7ee2SSatish Balay 1333*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS)); 1334*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS)); 1335*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS)); 1336*f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS)); 1337*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1338*f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1339*f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1340*f17f7ee2SSatish Balay #endif 1341*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1342*f17f7ee2SSatish Balay } 1343*f17f7ee2SSatish Balay 1344*f17f7ee2SSatish Balay /*@C 1345*f17f7ee2SSatish Balay MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1346*f17f7ee2SSatish Balay 1347*f17f7ee2SSatish Balay Input Parameter: 1348*f17f7ee2SSatish Balay . A - the matrix 1349*f17f7ee2SSatish Balay 1350*f17f7ee2SSatish Balay Level: intermediate 1351*f17f7ee2SSatish Balay 1352*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress() 1353*f17f7ee2SSatish Balay @*/ 1354*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1355*f17f7ee2SSatish Balay { 1356*f17f7ee2SSatish Balay PetscBool ish2opus; 1357*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1358*f17f7ee2SSatish Balay PetscMPIInt size; 1359*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1360*f17f7ee2SSatish Balay 1361*f17f7ee2SSatish Balay PetscFunctionBegin; 1362*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1363*f17f7ee2SSatish Balay PetscValidType(A,1); 1364*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1365*f17f7ee2SSatish Balay if (!ish2opus) PetscFunctionReturn(0); 1366*f17f7ee2SSatish Balay if (a->orthogonal) PetscFunctionReturn(0); 1367*f17f7ee2SSatish Balay HLibProfile::clear(); 1368*f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0)); 1369*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1370*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1371*f17f7ee2SSatish Balay #endif 1372*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1373*f17f7ee2SSatish Balay if (size > 1) { 1374*f17f7ee2SSatish Balay if (boundtocpu) { 1375*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1376*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1377*f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix, a->handle); 1378*f17f7ee2SSatish Balay #endif 1379*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1380*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1381*f17f7ee2SSatish Balay } else { 1382*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1383*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1384*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1385*f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1386*f17f7ee2SSatish Balay #endif 1387*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1388*f17f7ee2SSatish Balay #endif 1389*f17f7ee2SSatish Balay } 1390*f17f7ee2SSatish Balay } else { 1391*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1392*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1393*f17f7ee2SSatish Balay #else 1394*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1395*f17f7ee2SSatish Balay #endif 1396*f17f7ee2SSatish Balay if (boundtocpu) { 1397*f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1398*f17f7ee2SSatish Balay horthog(*a->hmatrix, handle); 1399*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1400*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1401*f17f7ee2SSatish Balay } else { 1402*f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1403*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1404*f17f7ee2SSatish Balay horthog(*a->hmatrix_gpu, handle); 1405*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1406*f17f7ee2SSatish Balay #endif 1407*f17f7ee2SSatish Balay } 1408*f17f7ee2SSatish Balay } 1409*f17f7ee2SSatish Balay a->orthogonal = PETSC_TRUE; 1410*f17f7ee2SSatish Balay { /* log flops */ 1411*f17f7ee2SSatish Balay double gops,time,perf,dev; 1412*f17f7ee2SSatish Balay HLibProfile::getHorthogPerf(gops,time,perf,dev); 1413*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1414*f17f7ee2SSatish Balay if (boundtocpu) { 1415*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1416*f17f7ee2SSatish Balay } else { 1417*f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 1418*f17f7ee2SSatish Balay } 1419*f17f7ee2SSatish Balay #else 1420*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1421*f17f7ee2SSatish Balay #endif 1422*f17f7ee2SSatish Balay } 1423*f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0)); 1424*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1425*f17f7ee2SSatish Balay } 1426*f17f7ee2SSatish Balay 1427*f17f7ee2SSatish Balay /*@C 1428*f17f7ee2SSatish Balay MatH2OpusCompress - Compress a hierarchical matrix. 1429*f17f7ee2SSatish Balay 1430*f17f7ee2SSatish Balay Input Parameters: 1431*f17f7ee2SSatish Balay + A - the matrix 1432*f17f7ee2SSatish Balay - tol - the absolute truncation threshold 1433*f17f7ee2SSatish Balay 1434*f17f7ee2SSatish Balay Level: intermediate 1435*f17f7ee2SSatish Balay 1436*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize() 1437*f17f7ee2SSatish Balay @*/ 1438*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1439*f17f7ee2SSatish Balay { 1440*f17f7ee2SSatish Balay PetscBool ish2opus; 1441*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1442*f17f7ee2SSatish Balay PetscMPIInt size; 1443*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1444*f17f7ee2SSatish Balay 1445*f17f7ee2SSatish Balay PetscFunctionBegin; 1446*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1447*f17f7ee2SSatish Balay PetscValidType(A,1); 1448*f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A,tol,2); 1449*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1450*f17f7ee2SSatish Balay if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0); 1451*f17f7ee2SSatish Balay PetscCall(MatH2OpusOrthogonalize(A)); 1452*f17f7ee2SSatish Balay HLibProfile::clear(); 1453*f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0)); 1454*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1455*f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1456*f17f7ee2SSatish Balay #endif 1457*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1458*f17f7ee2SSatish Balay if (size > 1) { 1459*f17f7ee2SSatish Balay if (boundtocpu) { 1460*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1461*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1462*f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1463*f17f7ee2SSatish Balay #endif 1464*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1465*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1466*f17f7ee2SSatish Balay } else { 1467*f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1468*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1469*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1470*f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1471*f17f7ee2SSatish Balay #endif 1472*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1473*f17f7ee2SSatish Balay #endif 1474*f17f7ee2SSatish Balay } 1475*f17f7ee2SSatish Balay } else { 1476*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1477*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1478*f17f7ee2SSatish Balay #else 1479*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1480*f17f7ee2SSatish Balay #endif 1481*f17f7ee2SSatish Balay if (boundtocpu) { 1482*f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1483*f17f7ee2SSatish Balay hcompress(*a->hmatrix, tol, handle); 1484*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1485*f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1486*f17f7ee2SSatish Balay } else { 1487*f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1488*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1489*f17f7ee2SSatish Balay hcompress(*a->hmatrix_gpu, tol, handle); 1490*f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1491*f17f7ee2SSatish Balay #endif 1492*f17f7ee2SSatish Balay } 1493*f17f7ee2SSatish Balay } 1494*f17f7ee2SSatish Balay { /* log flops */ 1495*f17f7ee2SSatish Balay double gops,time,perf,dev; 1496*f17f7ee2SSatish Balay HLibProfile::getHcompressPerf(gops,time,perf,dev); 1497*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1498*f17f7ee2SSatish Balay if (boundtocpu) { 1499*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1500*f17f7ee2SSatish Balay } else { 1501*f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 1502*f17f7ee2SSatish Balay } 1503*f17f7ee2SSatish Balay #else 1504*f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1505*f17f7ee2SSatish Balay #endif 1506*f17f7ee2SSatish Balay } 1507*f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0)); 1508*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1509*f17f7ee2SSatish Balay } 1510*f17f7ee2SSatish Balay 1511*f17f7ee2SSatish Balay /*@C 1512*f17f7ee2SSatish Balay MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix. 1513*f17f7ee2SSatish Balay 1514*f17f7ee2SSatish Balay Input Parameters: 1515*f17f7ee2SSatish Balay + A - the hierarchical matrix 1516*f17f7ee2SSatish Balay . B - the matrix to be sampled 1517*f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1518*f17f7ee2SSatish Balay - tol - relative tolerance for construction 1519*f17f7ee2SSatish Balay 1520*f17f7ee2SSatish Balay Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix. 1521*f17f7ee2SSatish Balay 1522*f17f7ee2SSatish Balay Level: intermediate 1523*f17f7ee2SSatish Balay 1524*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize() 1525*f17f7ee2SSatish Balay @*/ 1526*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1527*f17f7ee2SSatish Balay { 1528*f17f7ee2SSatish Balay PetscBool ish2opus; 1529*f17f7ee2SSatish Balay 1530*f17f7ee2SSatish Balay PetscFunctionBegin; 1531*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1532*f17f7ee2SSatish Balay PetscValidType(A,1); 1533*f17f7ee2SSatish Balay if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2); 1534*f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(A,bs,3); 1535*f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A,tol,3); 1536*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1537*f17f7ee2SSatish Balay if (ish2opus) { 1538*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1539*f17f7ee2SSatish Balay 1540*f17f7ee2SSatish Balay if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1541*f17f7ee2SSatish Balay a->sampler->SetSamplingMat(B); 1542*f17f7ee2SSatish Balay if (bs > 0) a->bs = bs; 1543*f17f7ee2SSatish Balay if (tol > 0.) a->rtol = tol; 1544*f17f7ee2SSatish Balay delete a->kernel; 1545*f17f7ee2SSatish Balay } 1546*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1547*f17f7ee2SSatish Balay } 1548*f17f7ee2SSatish Balay 1549*f17f7ee2SSatish Balay /*@C 1550*f17f7ee2SSatish Balay MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel. 1551*f17f7ee2SSatish Balay 1552*f17f7ee2SSatish Balay Input Parameters: 1553*f17f7ee2SSatish Balay + comm - MPI communicator 1554*f17f7ee2SSatish Balay . m - number of local rows (or PETSC_DECIDE to have calculated if M is given) 1555*f17f7ee2SSatish Balay . n - number of local columns (or PETSC_DECIDE to have calculated if N is given) 1556*f17f7ee2SSatish Balay . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given) 1557*f17f7ee2SSatish Balay . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given) 1558*f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1559*f17f7ee2SSatish Balay . coords - coordinates of the points 1560*f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 1561*f17f7ee2SSatish Balay . kernel - computational kernel (or NULL) 1562*f17f7ee2SSatish Balay . kernelctx - kernel context 1563*f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1564*f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1565*f17f7ee2SSatish Balay - basisord - approximation order for Chebychev interpolation of low-rank blocks 1566*f17f7ee2SSatish Balay 1567*f17f7ee2SSatish Balay Output Parameter: 1568*f17f7ee2SSatish Balay . nA - matrix 1569*f17f7ee2SSatish Balay 1570*f17f7ee2SSatish Balay Options Database Keys: 1571*f17f7ee2SSatish Balay + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1572*f17f7ee2SSatish Balay . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1573*f17f7ee2SSatish Balay . -mat_h2opus_order <PetscInt> - Chebychev approximation order 1574*f17f7ee2SSatish Balay - -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms 1575*f17f7ee2SSatish Balay 1576*f17f7ee2SSatish Balay Level: intermediate 1577*f17f7ee2SSatish Balay 1578*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat() 1579*f17f7ee2SSatish Balay @*/ 1580*f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA) 1581*f17f7ee2SSatish Balay { 1582*f17f7ee2SSatish Balay Mat A; 1583*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1584*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1585*f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1586*f17f7ee2SSatish Balay #else 1587*f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1588*f17f7ee2SSatish Balay #endif 1589*f17f7ee2SSatish Balay 1590*f17f7ee2SSatish Balay PetscFunctionBegin; 1591*f17f7ee2SSatish Balay PetscCheckFalse(m != n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 1592*f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1593*f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,m,n,M,N)); 1594*f17f7ee2SSatish Balay PetscCheckFalse(M != N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1595*f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1596*f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,iscpu)); 1597*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx)); 1598*f17f7ee2SSatish Balay 1599*f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS*)A->data; 1600*f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1601*f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1602*f17f7ee2SSatish Balay if (basisord > 0) h2opus->basisord = basisord; 1603*f17f7ee2SSatish Balay 1604*f17f7ee2SSatish Balay *nA = A; 1605*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1606*f17f7ee2SSatish Balay } 1607*f17f7ee2SSatish Balay 1608*f17f7ee2SSatish Balay /*@C 1609*f17f7ee2SSatish Balay MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator. 1610*f17f7ee2SSatish Balay 1611*f17f7ee2SSatish Balay Input Parameters: 1612*f17f7ee2SSatish Balay + B - the matrix to be sampled 1613*f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1614*f17f7ee2SSatish Balay . coords - coordinates of the points 1615*f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 1616*f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1617*f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1618*f17f7ee2SSatish Balay . maxrank - maximum rank allowed 1619*f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1620*f17f7ee2SSatish Balay - rtol - relative tolerance for construction 1621*f17f7ee2SSatish Balay 1622*f17f7ee2SSatish Balay Output Parameter: 1623*f17f7ee2SSatish Balay . nA - matrix 1624*f17f7ee2SSatish Balay 1625*f17f7ee2SSatish Balay Options Database Keys: 1626*f17f7ee2SSatish Balay + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1627*f17f7ee2SSatish Balay . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1628*f17f7ee2SSatish Balay . -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs 1629*f17f7ee2SSatish Balay . -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs 1630*f17f7ee2SSatish Balay . -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling 1631*f17f7ee2SSatish Balay . -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd() 1632*f17f7ee2SSatish Balay . -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction 1633*f17f7ee2SSatish Balay - -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms 1634*f17f7ee2SSatish Balay 1635*f17f7ee2SSatish Balay Notes: not available in parallel 1636*f17f7ee2SSatish Balay 1637*f17f7ee2SSatish Balay Level: intermediate 1638*f17f7ee2SSatish Balay 1639*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel() 1640*f17f7ee2SSatish Balay @*/ 1641*f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) 1642*f17f7ee2SSatish Balay { 1643*f17f7ee2SSatish Balay Mat A; 1644*f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1645*f17f7ee2SSatish Balay MPI_Comm comm; 1646*f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1647*f17f7ee2SSatish Balay 1648*f17f7ee2SSatish Balay PetscFunctionBegin; 1649*f17f7ee2SSatish Balay PetscValidHeaderSpecific(B,MAT_CLASSID,1); 1650*f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,spacedim,2); 1651*f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(B,cdist,4); 1652*f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B,eta,5); 1653*f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,leafsize,6); 1654*f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,maxrank,7); 1655*f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,bs,8); 1656*f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B,rtol,9); 1657*f17f7ee2SSatish Balay PetscValidPointer(nA,10); 1658*f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1659*f17f7ee2SSatish Balay PetscCheckFalse(B->rmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 1660*f17f7ee2SSatish Balay PetscCheckFalse(B->rmap->N != B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1661*f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1662*f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1663*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1664*f17f7ee2SSatish Balay { 1665*f17f7ee2SSatish Balay PetscBool iscuda; 1666*f17f7ee2SSatish Balay VecType vtype; 1667*f17f7ee2SSatish Balay 1668*f17f7ee2SSatish Balay PetscCall(MatGetVecType(B,&vtype)); 1669*f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda)); 1670*f17f7ee2SSatish Balay if (!iscuda) { 1671*f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda)); 1672*f17f7ee2SSatish Balay if (!iscuda) { 1673*f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda)); 1674*f17f7ee2SSatish Balay } 1675*f17f7ee2SSatish Balay } 1676*f17f7ee2SSatish Balay if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1677*f17f7ee2SSatish Balay } 1678*f17f7ee2SSatish Balay #endif 1679*f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1680*f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,boundtocpu)); 1681*f17f7ee2SSatish Balay if (spacedim) { 1682*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL)); 1683*f17f7ee2SSatish Balay } 1684*f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B,A)); 1685*f17f7ee2SSatish Balay /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1686*f17f7ee2SSatish Balay 1687*f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS*)A->data; 1688*f17f7ee2SSatish Balay h2opus->sampler = new PetscMatrixSampler(B); 1689*f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1690*f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1691*f17f7ee2SSatish Balay if (maxrank > 0) h2opus->max_rank = maxrank; 1692*f17f7ee2SSatish Balay if (bs > 0) h2opus->bs = bs; 1693*f17f7ee2SSatish Balay if (rtol > 0.) h2opus->rtol = rtol; 1694*f17f7ee2SSatish Balay *nA = A; 1695*f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 1696*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1697*f17f7ee2SSatish Balay } 1698*f17f7ee2SSatish Balay 1699*f17f7ee2SSatish Balay /*@C 1700*f17f7ee2SSatish Balay MatH2OpusGetIndexMap - Access reordering index set. 1701*f17f7ee2SSatish Balay 1702*f17f7ee2SSatish Balay Input Parameters: 1703*f17f7ee2SSatish Balay . A - the matrix 1704*f17f7ee2SSatish Balay 1705*f17f7ee2SSatish Balay Output Parameter: 1706*f17f7ee2SSatish Balay . indexmap - the index set for the reordering 1707*f17f7ee2SSatish Balay 1708*f17f7ee2SSatish Balay Level: intermediate 1709*f17f7ee2SSatish Balay 1710*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel() 1711*f17f7ee2SSatish Balay @*/ 1712*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1713*f17f7ee2SSatish Balay { 1714*f17f7ee2SSatish Balay PetscBool ish2opus; 1715*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1716*f17f7ee2SSatish Balay 1717*f17f7ee2SSatish Balay PetscFunctionBegin; 1718*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1719*f17f7ee2SSatish Balay PetscValidType(A,1); 1720*f17f7ee2SSatish Balay PetscValidPointer(indexmap,2); 1721*f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1722*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1723*f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 1724*f17f7ee2SSatish Balay *indexmap = a->h2opus_indexmap; 1725*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1726*f17f7ee2SSatish Balay } 1727*f17f7ee2SSatish Balay 1728*f17f7ee2SSatish Balay /*@C 1729*f17f7ee2SSatish Balay MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1730*f17f7ee2SSatish Balay 1731*f17f7ee2SSatish Balay Input Parameters: 1732*f17f7ee2SSatish Balay + A - the matrix 1733*f17f7ee2SSatish Balay . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1734*f17f7ee2SSatish Balay - in - the vector to be mapped 1735*f17f7ee2SSatish Balay 1736*f17f7ee2SSatish Balay Output Parameter: 1737*f17f7ee2SSatish Balay . out - the newly created mapped vector 1738*f17f7ee2SSatish Balay 1739*f17f7ee2SSatish Balay Level: intermediate 1740*f17f7ee2SSatish Balay 1741*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel() 1742*f17f7ee2SSatish Balay @*/ 1743*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out) 1744*f17f7ee2SSatish Balay { 1745*f17f7ee2SSatish Balay PetscBool ish2opus; 1746*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1747*f17f7ee2SSatish Balay PetscScalar *xin,*xout; 1748*f17f7ee2SSatish Balay PetscBool nm; 1749*f17f7ee2SSatish Balay 1750*f17f7ee2SSatish Balay PetscFunctionBegin; 1751*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1752*f17f7ee2SSatish Balay PetscValidType(A,1); 1753*f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A,nativetopetsc,2); 1754*f17f7ee2SSatish Balay PetscValidHeaderSpecific(in,VEC_CLASSID,3); 1755*f17f7ee2SSatish Balay PetscValidPointer(out,4); 1756*f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1757*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1758*f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 1759*f17f7ee2SSatish Balay nm = a->nativemult; 1760*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc)); 1761*f17f7ee2SSatish Balay PetscCall(MatCreateVecs(A,out,NULL)); 1762*f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,nm)); 1763*f17f7ee2SSatish Balay if (!a->sf) { /* same ordering */ 1764*f17f7ee2SSatish Balay PetscCall(VecCopy(in,*out)); 1765*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1766*f17f7ee2SSatish Balay } 1767*f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin)); 1768*f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(*out,&xout)); 1769*f17f7ee2SSatish Balay if (nativetopetsc) { 1770*f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1771*f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1772*f17f7ee2SSatish Balay } else { 1773*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1774*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1775*f17f7ee2SSatish Balay } 1776*f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin)); 1777*f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(*out,&xout)); 1778*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1779*f17f7ee2SSatish Balay } 1780*f17f7ee2SSatish Balay 1781*f17f7ee2SSatish Balay /*@C 1782*f17f7ee2SSatish Balay MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T 1783*f17f7ee2SSatish Balay 1784*f17f7ee2SSatish Balay Input Parameters: 1785*f17f7ee2SSatish Balay + A - the hierarchical matrix 1786*f17f7ee2SSatish Balay . s - the scaling factor 1787*f17f7ee2SSatish Balay . U - the dense low-rank update matrix 1788*f17f7ee2SSatish Balay - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed) 1789*f17f7ee2SSatish Balay 1790*f17f7ee2SSatish Balay Notes: The U and V matrices must be in dense format 1791*f17f7ee2SSatish Balay 1792*f17f7ee2SSatish Balay Level: intermediate 1793*f17f7ee2SSatish Balay 1794*f17f7ee2SSatish Balay .seealso: MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE 1795*f17f7ee2SSatish Balay @*/ 1796*f17f7ee2SSatish Balay PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1797*f17f7ee2SSatish Balay { 1798*f17f7ee2SSatish Balay PetscBool flg; 1799*f17f7ee2SSatish Balay 1800*f17f7ee2SSatish Balay PetscFunctionBegin; 1801*f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1802*f17f7ee2SSatish Balay PetscValidType(A,1); 1803*f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1804*f17f7ee2SSatish Balay PetscValidHeaderSpecific(U,MAT_CLASSID,2); 1805*f17f7ee2SSatish Balay PetscCheckSameComm(A,1,U,2); 1806*f17f7ee2SSatish Balay if (V) { 1807*f17f7ee2SSatish Balay PetscValidHeaderSpecific(V,MAT_CLASSID,3); 1808*f17f7ee2SSatish Balay PetscCheckSameComm(A,1,V,3); 1809*f17f7ee2SSatish Balay } 1810*f17f7ee2SSatish Balay PetscValidLogicalCollectiveScalar(A,s,4); 1811*f17f7ee2SSatish Balay 1812*f17f7ee2SSatish Balay if (!V) V = U; 1813*f17f7ee2SSatish Balay PetscCheckFalse(U->cmap->N != V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N); 1814*f17f7ee2SSatish Balay if (!U->cmap->N) PetscFunctionReturn(0); 1815*f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg)); 1816*f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout"); 1817*f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg)); 1818*f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout"); 1819*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg)); 1820*f17f7ee2SSatish Balay if (flg) { 1821*f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1822*f17f7ee2SSatish Balay const PetscScalar *u,*v,*uu,*vv; 1823*f17f7ee2SSatish Balay PetscInt ldu,ldv; 1824*f17f7ee2SSatish Balay PetscMPIInt size; 1825*f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1826*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1827*f17f7ee2SSatish Balay #else 1828*f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1829*f17f7ee2SSatish Balay #endif 1830*f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1831*f17f7ee2SSatish Balay PetscSF usf,vsf; 1832*f17f7ee2SSatish Balay 1833*f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1834*f17f7ee2SSatish Balay PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel"); 1835*f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0)); 1836*f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1837*f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name); 1838*f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1839*f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name); 1840*f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(U,&ldu)); 1841*f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(V,&ldv)); 1842*f17f7ee2SSatish Balay PetscCall(MatBoundToCPU(A,&flg)); 1843*f17f7ee2SSatish Balay if (usesf) { 1844*f17f7ee2SSatish Balay PetscInt n; 1845*f17f7ee2SSatish Balay 1846*f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf)); 1847*f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf)); 1848*f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N)); 1849*f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 1850*f17f7ee2SSatish Balay ldu = n; 1851*f17f7ee2SSatish Balay ldv = n; 1852*f17f7ee2SSatish Balay } 1853*f17f7ee2SSatish Balay if (flg) { 1854*f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1855*f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(U,&u)); 1856*f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(V,&v)); 1857*f17f7ee2SSatish Balay if (usesf) { 1858*f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy); 1859*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1860*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1861*f17f7ee2SSatish Balay if (U != V) { 1862*f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx); 1863*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1864*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1865*f17f7ee2SSatish Balay } else uu = vv; 1866*f17f7ee2SSatish Balay } else { uu = u; vv = v; } 1867*f17f7ee2SSatish Balay hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1868*f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(U,&u)); 1869*f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(V,&v)); 1870*f17f7ee2SSatish Balay } else { 1871*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1872*f17f7ee2SSatish Balay PetscBool flgU, flgV; 1873*f17f7ee2SSatish Balay 1874*f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1875*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"")); 1876*f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U)); 1877*f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"")); 1878*f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V)); 1879*f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(U,&u)); 1880*f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(V,&v)); 1881*f17f7ee2SSatish Balay if (usesf) { 1882*f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1883*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1884*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1885*f17f7ee2SSatish Balay if (U != V) { 1886*f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1887*f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1888*f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1889*f17f7ee2SSatish Balay } else uu = vv; 1890*f17f7ee2SSatish Balay } else { uu = u; vv = v; } 1891*f17f7ee2SSatish Balay #else 1892*f17f7ee2SSatish Balay SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen"); 1893*f17f7ee2SSatish Balay #endif 1894*f17f7ee2SSatish Balay hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1895*f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1896*f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(U,&u)); 1897*f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(V,&v)); 1898*f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U)); 1899*f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V)); 1900*f17f7ee2SSatish Balay #endif 1901*f17f7ee2SSatish Balay } 1902*f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0)); 1903*f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 1904*f17f7ee2SSatish Balay } 1905*f17f7ee2SSatish Balay PetscFunctionReturn(0); 1906*f17f7ee2SSatish Balay } 1907*f17f7ee2SSatish Balay #endif 1908