1f17f7ee2SSatish Balay #include <h2opusconf.h> 2f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4f17f7ee2SSatish Balay #include <h2opus.h> 5f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 6f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h> 7f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h> 8f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h> 9f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h> 10f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h> 11f17f7ee2SSatish Balay #endif 12f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h> 13f17f7ee2SSatish Balay #include <petsc/private/matimpl.h> 14f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h> 15f17f7ee2SSatish Balay #include <petsc/private/deviceimpl.h> 16f17f7ee2SSatish Balay #include <petscsf.h> 17f17f7ee2SSatish Balay 18f17f7ee2SSatish Balay /* math2opusutils */ 19f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*); 20f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*); 21f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec,Vec); 22f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt); 23f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*); 24f17f7ee2SSatish Balay 25f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26f17f7ee2SSatish Balay 27f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */ 28f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU 30f17f7ee2SSatish Balay #endif 31f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 32f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B))) 33f17f7ee2SSatish Balay #else 34f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A,B) 0 35f17f7ee2SSatish Balay #endif 36f17f7ee2SSatish Balay 37f17f7ee2SSatish Balay // TODO H2OPUS: 38f17f7ee2SSatish Balay // DistributedHMatrix 39f17f7ee2SSatish Balay // unsymmetric ? 40f17f7ee2SSatish Balay // transpose for distributed_hgemv? 41f17f7ee2SSatish Balay // clearData() 42f17f7ee2SSatish Balay // Unify interface for sequential and parallel? 43f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44f17f7ee2SSatish Balay // 45f17f7ee2SSatish Balay template <class T> class PetscPointCloud : public H2OpusDataSet<T> 46f17f7ee2SSatish Balay { 47f17f7ee2SSatish Balay private: 48f17f7ee2SSatish Balay int dimension; 49f17f7ee2SSatish Balay size_t num_points; 50f17f7ee2SSatish Balay std::vector<T> pts; 51f17f7ee2SSatish Balay 52f17f7ee2SSatish Balay public: 53f17f7ee2SSatish Balay PetscPointCloud(int dim, size_t num_pts, const T coords[]) 54f17f7ee2SSatish Balay { 55f17f7ee2SSatish Balay dim = dim > 0 ? dim : 1; 56f17f7ee2SSatish Balay this->dimension = dim; 57f17f7ee2SSatish Balay this->num_points = num_pts; 58f17f7ee2SSatish Balay 59f17f7ee2SSatish Balay pts.resize(num_pts*dim); 60f17f7ee2SSatish Balay if (coords) { 61*036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++) 62f17f7ee2SSatish Balay for (int i = 0; i < dim; i++) 63f17f7ee2SSatish Balay pts[n*dim + i] = coords[n*dim + i]; 64f17f7ee2SSatish Balay } else { 65*036e4bdcSSatish Balay PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0; 66*036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++) { 67*036e4bdcSSatish Balay pts[n*dim] = n*h; 68*036e4bdcSSatish Balay for (int i = 1; i < dim; i++) 69*036e4bdcSSatish Balay pts[n*dim + i] = 0.0; 70*036e4bdcSSatish Balay } 71f17f7ee2SSatish Balay } 72f17f7ee2SSatish Balay } 73f17f7ee2SSatish Balay 74f17f7ee2SSatish Balay PetscPointCloud(const PetscPointCloud<T>& other) 75f17f7ee2SSatish Balay { 76f17f7ee2SSatish Balay size_t N = other.dimension * other.num_points; 77f17f7ee2SSatish Balay this->dimension = other.dimension; 78f17f7ee2SSatish Balay this->num_points = other.num_points; 79f17f7ee2SSatish Balay this->pts.resize(N); 80f17f7ee2SSatish Balay for (size_t i = 0; i < N; i++) 81f17f7ee2SSatish Balay this->pts[i] = other.pts[i]; 82f17f7ee2SSatish Balay } 83f17f7ee2SSatish Balay 84f17f7ee2SSatish Balay int getDimension() const 85f17f7ee2SSatish Balay { 86f17f7ee2SSatish Balay return dimension; 87f17f7ee2SSatish Balay } 88f17f7ee2SSatish Balay 89f17f7ee2SSatish Balay size_t getDataSetSize() const 90f17f7ee2SSatish Balay { 91f17f7ee2SSatish Balay return num_points; 92f17f7ee2SSatish Balay } 93f17f7ee2SSatish Balay 94f17f7ee2SSatish Balay T getDataPoint(size_t idx, int dim) const 95f17f7ee2SSatish Balay { 96f17f7ee2SSatish Balay assert(dim < dimension && idx < num_points); 97f17f7ee2SSatish Balay return pts[idx*dimension + dim]; 98f17f7ee2SSatish Balay } 99f17f7ee2SSatish Balay 100f17f7ee2SSatish Balay void Print(std::ostream& out = std::cout) 101f17f7ee2SSatish Balay { 102f17f7ee2SSatish Balay out << "Dimension: " << dimension << std::endl; 103f17f7ee2SSatish Balay out << "NumPoints: " << num_points << std::endl; 104f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) { 105f17f7ee2SSatish Balay for (int d = 0; d < dimension; d++) 106f17f7ee2SSatish Balay out << pts[n*dimension + d] << " "; 107f17f7ee2SSatish Balay out << std::endl; 108f17f7ee2SSatish Balay } 109f17f7ee2SSatish Balay } 110f17f7ee2SSatish Balay }; 111f17f7ee2SSatish Balay 112f17f7ee2SSatish Balay template<class T> class PetscFunctionGenerator 113f17f7ee2SSatish Balay { 114f17f7ee2SSatish Balay private: 115f17f7ee2SSatish Balay MatH2OpusKernel k; 116f17f7ee2SSatish Balay int dim; 117f17f7ee2SSatish Balay void *ctx; 118f17f7ee2SSatish Balay 119f17f7ee2SSatish Balay public: 120f17f7ee2SSatish Balay PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; } 121f17f7ee2SSatish Balay PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; } 122f17f7ee2SSatish Balay T operator()(PetscReal *pt1, PetscReal *pt2) 123f17f7ee2SSatish Balay { 124f17f7ee2SSatish Balay return (T)((*this->k)(this->dim,pt1,pt2,this->ctx)); 125f17f7ee2SSatish Balay } 126f17f7ee2SSatish Balay }; 127f17f7ee2SSatish Balay 128f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp> 129f17f7ee2SSatish Balay 130f17f7ee2SSatish Balay /* just to not clutter the code */ 131f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU) 132f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU; 133f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 134f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU; 135f17f7ee2SSatish Balay #endif 136f17f7ee2SSatish Balay #endif 137f17f7ee2SSatish Balay 138f17f7ee2SSatish Balay typedef struct { 139f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 140f17f7ee2SSatish Balay distributedH2OpusHandle_t handle; 141f17f7ee2SSatish Balay #else 142f17f7ee2SSatish Balay h2opusHandle_t handle; 143f17f7ee2SSatish Balay #endif 144f17f7ee2SSatish Balay /* Sequential and parallel matrices are two different classes at the moment */ 145f17f7ee2SSatish Balay HMatrix *hmatrix; 146f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 147f17f7ee2SSatish Balay DistributedHMatrix *dist_hmatrix; 148f17f7ee2SSatish Balay #else 149f17f7ee2SSatish Balay HMatrix *dist_hmatrix; /* just to not clutter the code */ 150f17f7ee2SSatish Balay #endif 151f17f7ee2SSatish Balay /* May use permutations */ 152f17f7ee2SSatish Balay PetscSF sf; 153f17f7ee2SSatish Balay PetscLayout h2opus_rmap, h2opus_cmap; 154f17f7ee2SSatish Balay IS h2opus_indexmap; 155f17f7ee2SSatish Balay thrust::host_vector<PetscScalar> *xx,*yy; 156f17f7ee2SSatish Balay PetscInt xxs,yys; 157f17f7ee2SSatish Balay PetscBool multsetup; 158f17f7ee2SSatish Balay 159f17f7ee2SSatish Balay /* GPU */ 160f17f7ee2SSatish Balay HMatrix_GPU *hmatrix_gpu; 161f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 162f17f7ee2SSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu; 163f17f7ee2SSatish Balay #else 164f17f7ee2SSatish Balay HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 165f17f7ee2SSatish Balay #endif 166f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 167f17f7ee2SSatish Balay thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; 168f17f7ee2SSatish Balay PetscInt xxs_gpu,yys_gpu; 169f17f7ee2SSatish Balay #endif 170f17f7ee2SSatish Balay 171f17f7ee2SSatish Balay /* construction from matvecs */ 172f17f7ee2SSatish Balay PetscMatrixSampler* sampler; 173f17f7ee2SSatish Balay PetscBool nativemult; 174f17f7ee2SSatish Balay 175f17f7ee2SSatish Balay /* Admissibility */ 176f17f7ee2SSatish Balay PetscReal eta; 177f17f7ee2SSatish Balay PetscInt leafsize; 178f17f7ee2SSatish Balay 179f17f7ee2SSatish Balay /* for dof reordering */ 180f17f7ee2SSatish Balay PetscPointCloud<PetscReal> *ptcloud; 181f17f7ee2SSatish Balay 182f17f7ee2SSatish Balay /* kernel for generating matrix entries */ 183f17f7ee2SSatish Balay PetscFunctionGenerator<PetscScalar> *kernel; 184f17f7ee2SSatish Balay 185f17f7ee2SSatish Balay /* basis orthogonalized? */ 186f17f7ee2SSatish Balay PetscBool orthogonal; 187f17f7ee2SSatish Balay 188f17f7ee2SSatish Balay /* customization */ 189f17f7ee2SSatish Balay PetscInt basisord; 190f17f7ee2SSatish Balay PetscInt max_rank; 191f17f7ee2SSatish Balay PetscInt bs; 192f17f7ee2SSatish Balay PetscReal rtol; 193f17f7ee2SSatish Balay PetscInt norm_max_samples; 194f17f7ee2SSatish Balay PetscBool check_construction; 195f17f7ee2SSatish Balay PetscBool hara_verbose; 196*036e4bdcSSatish Balay PetscBool resize; 197f17f7ee2SSatish Balay 198f17f7ee2SSatish Balay /* keeps track of MatScale values */ 199f17f7ee2SSatish Balay PetscScalar s; 200f17f7ee2SSatish Balay } Mat_H2OPUS; 201f17f7ee2SSatish Balay 202f17f7ee2SSatish Balay static PetscErrorCode MatDestroy_H2OPUS(Mat A) 203f17f7ee2SSatish Balay { 204f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 205f17f7ee2SSatish Balay 206f17f7ee2SSatish Balay PetscFunctionBegin; 207f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 208f17f7ee2SSatish Balay h2opusDestroyDistributedHandle(a->handle); 209f17f7ee2SSatish Balay #else 210f17f7ee2SSatish Balay h2opusDestroyHandle(a->handle); 211f17f7ee2SSatish Balay #endif 212f17f7ee2SSatish Balay delete a->dist_hmatrix; 213f17f7ee2SSatish Balay delete a->hmatrix; 214f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&a->sf)); 215f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 216f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 217f17f7ee2SSatish Balay PetscCall(ISDestroy(&a->h2opus_indexmap)); 218f17f7ee2SSatish Balay delete a->xx; 219f17f7ee2SSatish Balay delete a->yy; 220f17f7ee2SSatish Balay delete a->hmatrix_gpu; 221f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 222f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 223f17f7ee2SSatish Balay delete a->xx_gpu; 224f17f7ee2SSatish Balay delete a->yy_gpu; 225f17f7ee2SSatish Balay #endif 226f17f7ee2SSatish Balay delete a->sampler; 227f17f7ee2SSatish Balay delete a->ptcloud; 228f17f7ee2SSatish Balay delete a->kernel; 229f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL)); 230f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL)); 231f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL)); 232f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL)); 233f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL)); 234f17f7ee2SSatish Balay PetscCall(PetscFree(A->data)); 235f17f7ee2SSatish Balay PetscFunctionReturn(0); 236f17f7ee2SSatish Balay } 237f17f7ee2SSatish Balay 238f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) 239f17f7ee2SSatish Balay { 240f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 241f17f7ee2SSatish Balay PetscBool ish2opus; 242f17f7ee2SSatish Balay 243f17f7ee2SSatish Balay PetscFunctionBegin; 244f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 245f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A,nm,2); 246f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 247f17f7ee2SSatish Balay if (ish2opus) { 248f17f7ee2SSatish Balay if (a->h2opus_rmap) { /* need to swap layouts for vector creation */ 249f17f7ee2SSatish Balay if ((!a->nativemult && nm) || (a->nativemult && !nm)) { 250f17f7ee2SSatish Balay PetscLayout t; 251f17f7ee2SSatish Balay t = A->rmap; 252f17f7ee2SSatish Balay A->rmap = a->h2opus_rmap; 253f17f7ee2SSatish Balay a->h2opus_rmap = t; 254f17f7ee2SSatish Balay t = A->cmap; 255f17f7ee2SSatish Balay A->cmap = a->h2opus_cmap; 256f17f7ee2SSatish Balay a->h2opus_cmap = t; 257f17f7ee2SSatish Balay } 258f17f7ee2SSatish Balay } 259f17f7ee2SSatish Balay a->nativemult = nm; 260f17f7ee2SSatish Balay } 261f17f7ee2SSatish Balay PetscFunctionReturn(0); 262f17f7ee2SSatish Balay } 263f17f7ee2SSatish Balay 264f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) 265f17f7ee2SSatish Balay { 266f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 267f17f7ee2SSatish Balay PetscBool ish2opus; 268f17f7ee2SSatish Balay 269f17f7ee2SSatish Balay PetscFunctionBegin; 270f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 271f17f7ee2SSatish Balay PetscValidPointer(nm,2); 272f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 273f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 274f17f7ee2SSatish Balay *nm = a->nativemult; 275f17f7ee2SSatish Balay PetscFunctionReturn(0); 276f17f7ee2SSatish Balay } 277f17f7ee2SSatish Balay 278f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n) 279f17f7ee2SSatish Balay { 280f17f7ee2SSatish Balay PetscBool ish2opus; 281f17f7ee2SSatish Balay PetscInt nmax = PETSC_DECIDE; 282f17f7ee2SSatish Balay Mat_H2OPUS *a = NULL; 283f17f7ee2SSatish Balay PetscBool mult = PETSC_FALSE; 284f17f7ee2SSatish Balay 285f17f7ee2SSatish Balay PetscFunctionBegin; 286f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 287f17f7ee2SSatish Balay if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */ 288f17f7ee2SSatish Balay a = (Mat_H2OPUS*)A->data; 289f17f7ee2SSatish Balay 290f17f7ee2SSatish Balay nmax = a->norm_max_samples; 291f17f7ee2SSatish Balay mult = a->nativemult; 292f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE)); 293f17f7ee2SSatish Balay } else { 294f17f7ee2SSatish Balay PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL)); 295f17f7ee2SSatish Balay } 296f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n)); 297f17f7ee2SSatish Balay if (a) PetscCall(MatH2OpusSetNativeMult(A,mult)); 298f17f7ee2SSatish Balay PetscFunctionReturn(0); 299f17f7ee2SSatish Balay } 300f17f7ee2SSatish Balay 301f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) 302f17f7ee2SSatish Balay { 303f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 304f17f7ee2SSatish Balay PetscInt n; 305f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 306f17f7ee2SSatish Balay 307f17f7ee2SSatish Balay PetscFunctionBegin; 308f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 309f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 310f17f7ee2SSatish Balay #endif 311f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 312f17f7ee2SSatish Balay if (boundtocpu) { 313f17f7ee2SSatish Balay if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; } 314f17f7ee2SSatish Balay if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; } 315f17f7ee2SSatish Balay } 316f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 317f17f7ee2SSatish Balay if (!boundtocpu) { 318f17f7ee2SSatish Balay if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; } 319f17f7ee2SSatish Balay if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; } 320f17f7ee2SSatish Balay } 321f17f7ee2SSatish Balay #endif 322f17f7ee2SSatish Balay PetscFunctionReturn(0); 323f17f7ee2SSatish Balay } 324f17f7ee2SSatish Balay 325f17f7ee2SSatish Balay static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) 326f17f7ee2SSatish Balay { 327f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 328f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 329f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 330f17f7ee2SSatish Balay #else 331f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 332f17f7ee2SSatish Balay #endif 333f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 334f17f7ee2SSatish Balay PetscScalar *xx,*yy,*uxx,*uyy; 335f17f7ee2SSatish Balay PetscInt blda,clda; 336f17f7ee2SSatish Balay PetscMPIInt size; 337f17f7ee2SSatish Balay PetscSF bsf,csf; 338f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 339f17f7ee2SSatish Balay 340f17f7ee2SSatish Balay PetscFunctionBegin; 341f17f7ee2SSatish Balay HLibProfile::clear(); 342f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 343f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 344f17f7ee2SSatish Balay #endif 345f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(B,&blda)); 346f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(C,&clda)); 347f17f7ee2SSatish Balay if (usesf) { 348f17f7ee2SSatish Balay PetscInt n; 349f17f7ee2SSatish Balay 350f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf)); 351f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf)); 352f17f7ee2SSatish Balay 353f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N)); 354f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 355f17f7ee2SSatish Balay blda = n; 356f17f7ee2SSatish Balay clda = n; 357f17f7ee2SSatish Balay } 358f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 359f17f7ee2SSatish Balay if (boundtocpu) { 360f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx)); 361f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayWrite(C,&yy)); 362f17f7ee2SSatish Balay if (usesf) { 363f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 364f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 365f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 366f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 367f17f7ee2SSatish Balay } else { 368f17f7ee2SSatish Balay uxx = xx; 369f17f7ee2SSatish Balay uyy = yy; 370f17f7ee2SSatish Balay } 371f17f7ee2SSatish Balay if (size > 1) { 372f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 373*036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 374f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 375f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 376f17f7ee2SSatish Balay #endif 377f17f7ee2SSatish Balay } else { 378f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 379f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 380f17f7ee2SSatish Balay } 381f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx)); 382f17f7ee2SSatish Balay if (usesf) { 383f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 384f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 385f17f7ee2SSatish Balay } 386f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayWrite(C,&yy)); 387f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 388f17f7ee2SSatish Balay } else { 389f17f7ee2SSatish Balay PetscBool ciscuda,biscuda; 390f17f7ee2SSatish Balay 391f17f7ee2SSatish Balay /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */ 392f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 393f17f7ee2SSatish Balay if (!biscuda) { 394f17f7ee2SSatish Balay PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B)); 395f17f7ee2SSatish Balay } 396f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 397f17f7ee2SSatish Balay if (!ciscuda) { 398f17f7ee2SSatish Balay C->assembled = PETSC_TRUE; 399f17f7ee2SSatish Balay PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C)); 400f17f7ee2SSatish Balay } 401f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx)); 402f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayWrite(C,&yy)); 403f17f7ee2SSatish Balay if (usesf) { 404f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 405f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 406f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 407f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 408f17f7ee2SSatish Balay } else { 409f17f7ee2SSatish Balay uxx = xx; 410f17f7ee2SSatish Balay uyy = yy; 411f17f7ee2SSatish Balay } 412f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 413f17f7ee2SSatish Balay if (size > 1) { 414f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix"); 415*036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 416f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 417f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 418f17f7ee2SSatish Balay #endif 419f17f7ee2SSatish Balay } else { 420f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 421f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 422f17f7ee2SSatish Balay } 423f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 424f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx)); 425f17f7ee2SSatish Balay if (usesf) { 426f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 427f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 428f17f7ee2SSatish Balay } 429f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy)); 430f17f7ee2SSatish Balay if (!biscuda) { 431f17f7ee2SSatish Balay PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B)); 432f17f7ee2SSatish Balay } 433f17f7ee2SSatish Balay if (!ciscuda) { 434f17f7ee2SSatish Balay PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C)); 435f17f7ee2SSatish Balay } 436f17f7ee2SSatish Balay #endif 437f17f7ee2SSatish Balay } 438f17f7ee2SSatish Balay { /* log flops */ 439f17f7ee2SSatish Balay double gops,time,perf,dev; 440f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops,time,perf,dev); 441f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 442f17f7ee2SSatish Balay if (boundtocpu) { 443f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 444f17f7ee2SSatish Balay } else { 445f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 446f17f7ee2SSatish Balay } 447f17f7ee2SSatish Balay #else 448f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 449f17f7ee2SSatish Balay #endif 450f17f7ee2SSatish Balay } 451f17f7ee2SSatish Balay PetscFunctionReturn(0); 452f17f7ee2SSatish Balay } 453f17f7ee2SSatish Balay 454f17f7ee2SSatish Balay static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) 455f17f7ee2SSatish Balay { 456f17f7ee2SSatish Balay Mat_Product *product = C->product; 457f17f7ee2SSatish Balay 458f17f7ee2SSatish Balay PetscFunctionBegin; 459f17f7ee2SSatish Balay MatCheckProduct(C,1); 460f17f7ee2SSatish Balay switch (product->type) { 461f17f7ee2SSatish Balay case MATPRODUCT_AB: 462f17f7ee2SSatish Balay PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C)); 463f17f7ee2SSatish Balay break; 464f17f7ee2SSatish Balay case MATPRODUCT_AtB: 465f17f7ee2SSatish Balay PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C)); 466f17f7ee2SSatish Balay break; 467f17f7ee2SSatish Balay default: 468f17f7ee2SSatish Balay SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]); 469f17f7ee2SSatish Balay } 470f17f7ee2SSatish Balay PetscFunctionReturn(0); 471f17f7ee2SSatish Balay } 472f17f7ee2SSatish Balay 473f17f7ee2SSatish Balay static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) 474f17f7ee2SSatish Balay { 475f17f7ee2SSatish Balay Mat_Product *product = C->product; 476f17f7ee2SSatish Balay PetscBool cisdense; 477f17f7ee2SSatish Balay Mat A,B; 478f17f7ee2SSatish Balay 479f17f7ee2SSatish Balay PetscFunctionBegin; 480f17f7ee2SSatish Balay MatCheckProduct(C,1); 481f17f7ee2SSatish Balay A = product->A; 482f17f7ee2SSatish Balay B = product->B; 483f17f7ee2SSatish Balay switch (product->type) { 484f17f7ee2SSatish Balay case MATPRODUCT_AB: 485f17f7ee2SSatish Balay PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N)); 486f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B)); 487f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 488f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name)); 489f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 490f17f7ee2SSatish Balay break; 491f17f7ee2SSatish Balay case MATPRODUCT_AtB: 492f17f7ee2SSatish Balay PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N)); 493f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B)); 494f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"")); 495f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name)); 496f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 497f17f7ee2SSatish Balay break; 498f17f7ee2SSatish Balay default: 499f17f7ee2SSatish Balay SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]); 500f17f7ee2SSatish Balay } 501f17f7ee2SSatish Balay C->ops->productsymbolic = NULL; 502f17f7ee2SSatish Balay C->ops->productnumeric = MatProductNumeric_H2OPUS; 503f17f7ee2SSatish Balay PetscFunctionReturn(0); 504f17f7ee2SSatish Balay } 505f17f7ee2SSatish Balay 506f17f7ee2SSatish Balay static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) 507f17f7ee2SSatish Balay { 508f17f7ee2SSatish Balay PetscFunctionBegin; 509f17f7ee2SSatish Balay MatCheckProduct(C,1); 510f17f7ee2SSatish Balay if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) { 511f17f7ee2SSatish Balay C->ops->productsymbolic = MatProductSymbolic_H2OPUS; 512f17f7ee2SSatish Balay } 513f17f7ee2SSatish Balay PetscFunctionReturn(0); 514f17f7ee2SSatish Balay } 515f17f7ee2SSatish Balay 516f17f7ee2SSatish Balay static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) 517f17f7ee2SSatish Balay { 518f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 519f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 520f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 521f17f7ee2SSatish Balay #else 522f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 523f17f7ee2SSatish Balay #endif 524f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 525f17f7ee2SSatish Balay PetscInt n; 526f17f7ee2SSatish Balay PetscScalar *xx,*yy,*uxx,*uyy; 527f17f7ee2SSatish Balay PetscMPIInt size; 528f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 529f17f7ee2SSatish Balay 530f17f7ee2SSatish Balay PetscFunctionBegin; 531f17f7ee2SSatish Balay HLibProfile::clear(); 532f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 533f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 534f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 535f17f7ee2SSatish Balay #endif 536f17f7ee2SSatish Balay if (usesf) { 537f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL)); 538f17f7ee2SSatish Balay } else n = A->rmap->n; 539f17f7ee2SSatish Balay if (boundtocpu) { 540f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx)); 541f17f7ee2SSatish Balay if (sy == 0.0) { 542f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(y,&yy)); 543f17f7ee2SSatish Balay } else { 544f17f7ee2SSatish Balay PetscCall(VecGetArray(y,&yy)); 545f17f7ee2SSatish Balay } 546f17f7ee2SSatish Balay if (usesf) { 547f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 548f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 549f17f7ee2SSatish Balay 550f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 551f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 552f17f7ee2SSatish Balay if (sy != 0.0) { 553f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 554f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 555f17f7ee2SSatish Balay } 556f17f7ee2SSatish Balay } else { 557f17f7ee2SSatish Balay uxx = xx; 558f17f7ee2SSatish Balay uyy = yy; 559f17f7ee2SSatish Balay } 560f17f7ee2SSatish Balay if (size > 1) { 561f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 562*036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 563f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 564f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle); 565f17f7ee2SSatish Balay #endif 566f17f7ee2SSatish Balay } else { 567f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 568f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle); 569f17f7ee2SSatish Balay } 570f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx)); 571f17f7ee2SSatish Balay if (usesf) { 572f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 573f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 574f17f7ee2SSatish Balay } 575f17f7ee2SSatish Balay if (sy == 0.0) { 576f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(y,&yy)); 577f17f7ee2SSatish Balay } else { 578f17f7ee2SSatish Balay PetscCall(VecRestoreArray(y,&yy)); 579f17f7ee2SSatish Balay } 580f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 581f17f7ee2SSatish Balay } else { 582f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx)); 583f17f7ee2SSatish Balay if (sy == 0.0) { 584f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayWrite(y,&yy)); 585f17f7ee2SSatish Balay } else { 586f17f7ee2SSatish Balay PetscCall(VecCUDAGetArray(y,&yy)); 587f17f7ee2SSatish Balay } 588f17f7ee2SSatish Balay if (usesf) { 589f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 590f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 591f17f7ee2SSatish Balay 592f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 593f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE)); 594f17f7ee2SSatish Balay if (sy != 0.0) { 595f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 596f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE)); 597f17f7ee2SSatish Balay } 598f17f7ee2SSatish Balay } else { 599f17f7ee2SSatish Balay uxx = xx; 600f17f7ee2SSatish Balay uyy = yy; 601f17f7ee2SSatish Balay } 602f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 603f17f7ee2SSatish Balay if (size > 1) { 604f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix"); 605*036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel"); 606f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 607f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle); 608f17f7ee2SSatish Balay #endif 609f17f7ee2SSatish Balay } else { 610f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 611f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle); 612f17f7ee2SSatish Balay } 613f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 614f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx)); 615f17f7ee2SSatish Balay if (usesf) { 616f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 617f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE)); 618f17f7ee2SSatish Balay } 619f17f7ee2SSatish Balay if (sy == 0.0) { 620f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayWrite(y,&yy)); 621f17f7ee2SSatish Balay } else { 622f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArray(y,&yy)); 623f17f7ee2SSatish Balay } 624f17f7ee2SSatish Balay #endif 625f17f7ee2SSatish Balay } 626f17f7ee2SSatish Balay { /* log flops */ 627f17f7ee2SSatish Balay double gops,time,perf,dev; 628f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops,time,perf,dev); 629f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 630f17f7ee2SSatish Balay if (boundtocpu) { 631f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 632f17f7ee2SSatish Balay } else { 633f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 634f17f7ee2SSatish Balay } 635f17f7ee2SSatish Balay #else 636f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 637f17f7ee2SSatish Balay #endif 638f17f7ee2SSatish Balay } 639f17f7ee2SSatish Balay PetscFunctionReturn(0); 640f17f7ee2SSatish Balay } 641f17f7ee2SSatish Balay 642f17f7ee2SSatish Balay static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) 643f17f7ee2SSatish Balay { 644f17f7ee2SSatish Balay PetscBool xiscuda,yiscuda; 645f17f7ee2SSatish Balay 646f17f7ee2SSatish Balay PetscFunctionBegin; 647f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 648f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"")); 649f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda)); 650f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE)); 651f17f7ee2SSatish Balay PetscFunctionReturn(0); 652f17f7ee2SSatish Balay } 653f17f7ee2SSatish Balay 654f17f7ee2SSatish Balay static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) 655f17f7ee2SSatish Balay { 656f17f7ee2SSatish Balay PetscBool xiscuda,yiscuda; 657f17f7ee2SSatish Balay 658f17f7ee2SSatish Balay PetscFunctionBegin; 659f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 660f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,"")); 661f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda)); 662f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE)); 663f17f7ee2SSatish Balay PetscFunctionReturn(0); 664f17f7ee2SSatish Balay } 665f17f7ee2SSatish Balay 666f17f7ee2SSatish Balay static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 667f17f7ee2SSatish Balay { 668f17f7ee2SSatish Balay PetscBool xiscuda,ziscuda; 669f17f7ee2SSatish Balay 670f17f7ee2SSatish Balay PetscFunctionBegin; 671f17f7ee2SSatish Balay PetscCall(VecCopy(y,z)); 672f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 673f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 674f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 675f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE)); 676f17f7ee2SSatish Balay PetscFunctionReturn(0); 677f17f7ee2SSatish Balay } 678f17f7ee2SSatish Balay 679f17f7ee2SSatish Balay static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 680f17f7ee2SSatish Balay { 681f17f7ee2SSatish Balay PetscBool xiscuda,ziscuda; 682f17f7ee2SSatish Balay 683f17f7ee2SSatish Balay PetscFunctionBegin; 684f17f7ee2SSatish Balay PetscCall(VecCopy(y,z)); 685f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,"")); 686f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,"")); 687f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda)); 688f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE)); 689f17f7ee2SSatish Balay PetscFunctionReturn(0); 690f17f7ee2SSatish Balay } 691f17f7ee2SSatish Balay 692f17f7ee2SSatish Balay static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 693f17f7ee2SSatish Balay { 694f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 695f17f7ee2SSatish Balay 696f17f7ee2SSatish Balay PetscFunctionBegin; 697f17f7ee2SSatish Balay a->s *= s; 698f17f7ee2SSatish Balay PetscFunctionReturn(0); 699f17f7ee2SSatish Balay } 700f17f7ee2SSatish Balay 701f17f7ee2SSatish Balay static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A) 702f17f7ee2SSatish Balay { 703f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 704f17f7ee2SSatish Balay 705f17f7ee2SSatish Balay PetscFunctionBegin; 706*036e4bdcSSatish Balay PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options"); 707f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL)); 708f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL)); 709f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL)); 710f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL)); 711f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL)); 712f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL)); 713f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL)); 714f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL)); 715f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL)); 716*036e4bdcSSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL)); 717*036e4bdcSSatish Balay PetscOptionsHeadEnd(); 718f17f7ee2SSatish Balay PetscFunctionReturn(0); 719f17f7ee2SSatish Balay } 720f17f7ee2SSatish Balay 721f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*); 722f17f7ee2SSatish Balay 723f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 724f17f7ee2SSatish Balay { 725f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 726f17f7ee2SSatish Balay Vec c; 727f17f7ee2SSatish Balay PetscInt spacedim; 728f17f7ee2SSatish Balay const PetscScalar *coords; 729f17f7ee2SSatish Balay 730f17f7ee2SSatish Balay PetscFunctionBegin; 731f17f7ee2SSatish Balay if (a->ptcloud) PetscFunctionReturn(0); 732f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c)); 733f17f7ee2SSatish Balay if (!c && a->sampler) { 734f17f7ee2SSatish Balay Mat S = a->sampler->GetSamplingMat(); 735f17f7ee2SSatish Balay 736f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c)); 737f17f7ee2SSatish Balay } 738f17f7ee2SSatish Balay if (!c) { 739f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL)); 740f17f7ee2SSatish Balay } else { 741f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(c,&coords)); 742f17f7ee2SSatish Balay PetscCall(VecGetBlockSize(c,&spacedim)); 743f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL)); 744f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(c,&coords)); 745f17f7ee2SSatish Balay } 746f17f7ee2SSatish Balay PetscFunctionReturn(0); 747f17f7ee2SSatish Balay } 748f17f7ee2SSatish Balay 749f17f7ee2SSatish Balay static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 750f17f7ee2SSatish Balay { 751f17f7ee2SSatish Balay MPI_Comm comm; 752f17f7ee2SSatish Balay PetscMPIInt size; 753f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 754f17f7ee2SSatish Balay PetscInt n = 0,*idx = NULL; 755f17f7ee2SSatish Balay int *iidx = NULL; 756f17f7ee2SSatish Balay PetscCopyMode own; 757f17f7ee2SSatish Balay PetscBool rid; 758f17f7ee2SSatish Balay 759f17f7ee2SSatish Balay PetscFunctionBegin; 760f17f7ee2SSatish Balay if (a->multsetup) PetscFunctionReturn(0); 761f17f7ee2SSatish Balay if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 762f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 763f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 764f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 765f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 766f17f7ee2SSatish Balay a->xxs_gpu = 1; 767f17f7ee2SSatish Balay a->yys_gpu = 1; 768f17f7ee2SSatish Balay #endif 769f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 770f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 771f17f7ee2SSatish Balay a->xxs = 1; 772f17f7ee2SSatish Balay a->yys = 1; 773f17f7ee2SSatish Balay } else { 774f17f7ee2SSatish Balay IS is; 775f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 776f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 777f17f7ee2SSatish Balay if (!a->h2opus_indexmap) { 778f17f7ee2SSatish Balay if (size > 1) { 779f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 780f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 781f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 782f17f7ee2SSatish Balay n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 783f17f7ee2SSatish Balay #endif 784f17f7ee2SSatish Balay } else { 785f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 786f17f7ee2SSatish Balay n = a->hmatrix->u_basis_tree.index_map.size(); 787f17f7ee2SSatish Balay } 788f17f7ee2SSatish Balay 789f17f7ee2SSatish Balay if (PetscDefined(USE_64BIT_INDICES)) { 790f17f7ee2SSatish Balay PetscInt i; 791f17f7ee2SSatish Balay 792f17f7ee2SSatish Balay own = PETSC_OWN_POINTER; 793f17f7ee2SSatish Balay PetscCall(PetscMalloc1(n,&idx)); 794f17f7ee2SSatish Balay for (i=0;i<n;i++) idx[i] = iidx[i]; 795f17f7ee2SSatish Balay } else { 796f17f7ee2SSatish Balay own = PETSC_COPY_VALUES; 797f17f7ee2SSatish Balay idx = (PetscInt*)iidx; 798f17f7ee2SSatish Balay } 799f17f7ee2SSatish Balay PetscCall(ISCreateGeneral(comm,n,idx,own,&is)); 800f17f7ee2SSatish Balay PetscCall(ISSetPermutation(is)); 801f17f7ee2SSatish Balay PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view")); 802f17f7ee2SSatish Balay a->h2opus_indexmap = is; 803f17f7ee2SSatish Balay } 804f17f7ee2SSatish Balay PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n)); 805f17f7ee2SSatish Balay PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 806f17f7ee2SSatish Balay rid = (PetscBool)(n == A->rmap->n); 807f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm)); 808f17f7ee2SSatish Balay if (rid) { 809f17f7ee2SSatish Balay PetscCall(ISIdentity(a->h2opus_indexmap,&rid)); 810f17f7ee2SSatish Balay } 811f17f7ee2SSatish Balay if (!rid) { 812f17f7ee2SSatish Balay if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 813f17f7ee2SSatish Balay PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap)); 814f17f7ee2SSatish Balay PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n)); 815f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 816f17f7ee2SSatish Balay PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap)); 817f17f7ee2SSatish Balay } 818f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm,&a->sf)); 819f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx)); 820*036e4bdcSSatish Balay PetscCall(PetscSFSetUp(a->sf)); 821f17f7ee2SSatish Balay PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view")); 822f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 823f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 824f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 825f17f7ee2SSatish Balay a->xxs_gpu = 1; 826f17f7ee2SSatish Balay a->yys_gpu = 1; 827f17f7ee2SSatish Balay #endif 828f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 829f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 830f17f7ee2SSatish Balay a->xxs = 1; 831f17f7ee2SSatish Balay a->yys = 1; 832f17f7ee2SSatish Balay } 833f17f7ee2SSatish Balay PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx)); 834f17f7ee2SSatish Balay } 835f17f7ee2SSatish Balay a->multsetup = PETSC_TRUE; 836f17f7ee2SSatish Balay PetscFunctionReturn(0); 837f17f7ee2SSatish Balay } 838f17f7ee2SSatish Balay 839f17f7ee2SSatish Balay static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) 840f17f7ee2SSatish Balay { 841f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 842f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 843f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 844f17f7ee2SSatish Balay #else 845f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 846f17f7ee2SSatish Balay #endif 847f17f7ee2SSatish Balay PetscBool kernel = PETSC_FALSE; 848f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 849f17f7ee2SSatish Balay PetscBool samplingdone = PETSC_FALSE; 850f17f7ee2SSatish Balay MPI_Comm comm; 851f17f7ee2SSatish Balay PetscMPIInt size; 852f17f7ee2SSatish Balay 853f17f7ee2SSatish Balay PetscFunctionBegin; 854f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 855*036e4bdcSSatish Balay PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 856*036e4bdcSSatish Balay PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 857f17f7ee2SSatish Balay 858f17f7ee2SSatish Balay /* XXX */ 859f17f7ee2SSatish Balay a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N)); 860f17f7ee2SSatish Balay 861f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 862f17f7ee2SSatish Balay /* TODO REUSABILITY of geometric construction */ 863f17f7ee2SSatish Balay delete a->hmatrix; 864f17f7ee2SSatish Balay delete a->dist_hmatrix; 865f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 866f17f7ee2SSatish Balay delete a->hmatrix_gpu; 867f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 868f17f7ee2SSatish Balay #endif 869f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 870f17f7ee2SSatish Balay 871f17f7ee2SSatish Balay /* TODO: other? */ 872f17f7ee2SSatish Balay H2OpusBoxCenterAdmissibility adm(a->eta); 873f17f7ee2SSatish Balay 874f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0)); 875f17f7ee2SSatish Balay if (size > 1) { 876f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 877f17f7ee2SSatish Balay a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */); 878f17f7ee2SSatish Balay #else 879f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 880f17f7ee2SSatish Balay #endif 881f17f7ee2SSatish Balay } else { 882f17f7ee2SSatish Balay a->hmatrix = new HMatrix(A->rmap->n,A->symmetric); 883f17f7ee2SSatish Balay } 884f17f7ee2SSatish Balay PetscCall(MatH2OpusInferCoordinates_Private(A)); 885f17f7ee2SSatish Balay PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud"); 886f17f7ee2SSatish Balay if (a->kernel) { 887f17f7ee2SSatish Balay BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 888f17f7ee2SSatish Balay if (size > 1) { 889f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 890f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 891f17f7ee2SSatish Balay buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle); 892f17f7ee2SSatish Balay #endif 893f17f7ee2SSatish Balay } else { 894f17f7ee2SSatish Balay buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord); 895f17f7ee2SSatish Balay } 896f17f7ee2SSatish Balay kernel = PETSC_TRUE; 897f17f7ee2SSatish Balay } else { 898*036e4bdcSSatish Balay PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel"); 899f17f7ee2SSatish Balay buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm); 900f17f7ee2SSatish Balay } 901f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 902f17f7ee2SSatish Balay 903f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 904f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 905f17f7ee2SSatish Balay if (!boundtocpu) { 906f17f7ee2SSatish Balay if (size > 1) { 907f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix"); 908f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 909f17f7ee2SSatish Balay a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 910f17f7ee2SSatish Balay #endif 911f17f7ee2SSatish Balay } else { 912f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 913f17f7ee2SSatish Balay } 914f17f7ee2SSatish Balay } 915f17f7ee2SSatish Balay #endif 916f17f7ee2SSatish Balay if (size == 1) { 917f17f7ee2SSatish Balay if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 918f17f7ee2SSatish Balay PetscReal Anorm; 919f17f7ee2SSatish Balay bool verbose; 920f17f7ee2SSatish Balay 921f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL)); 922f17f7ee2SSatish Balay verbose = a->hara_verbose; 923f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm)); 924f17f7ee2SSatish Balay if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs)); 925f17f7ee2SSatish Balay if (a->sf && !a->nativemult) { 926f17f7ee2SSatish Balay a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data()); 927f17f7ee2SSatish Balay } 928f17f7ee2SSatish Balay a->sampler->SetStream(handle->getMainStream()); 929f17f7ee2SSatish Balay if (boundtocpu) { 930f17f7ee2SSatish Balay a->sampler->SetGPUSampling(false); 931f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 932f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 933f17f7ee2SSatish Balay } else { 934f17f7ee2SSatish Balay a->sampler->SetGPUSampling(true); 935f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose); 936f17f7ee2SSatish Balay #endif 937f17f7ee2SSatish Balay } 938f17f7ee2SSatish Balay samplingdone = PETSC_TRUE; 939f17f7ee2SSatish Balay } 940f17f7ee2SSatish Balay } 941f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 942f17f7ee2SSatish Balay if (!boundtocpu) { 943f17f7ee2SSatish Balay delete a->hmatrix; 944f17f7ee2SSatish Balay delete a->dist_hmatrix; 945f17f7ee2SSatish Balay a->hmatrix = NULL; 946f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 947f17f7ee2SSatish Balay } 948f17f7ee2SSatish Balay A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 949f17f7ee2SSatish Balay #endif 950f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0)); 951f17f7ee2SSatish Balay 952f17f7ee2SSatish Balay if (!a->s) a->s = 1.0; 953f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 954f17f7ee2SSatish Balay 955f17f7ee2SSatish Balay if (samplingdone) { 956f17f7ee2SSatish Balay PetscBool check = a->check_construction; 957f17f7ee2SSatish Balay PetscBool checke = PETSC_FALSE; 958f17f7ee2SSatish Balay 959f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL)); 960f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL)); 961f17f7ee2SSatish Balay if (check) { 962f17f7ee2SSatish Balay Mat E,Ae; 963f17f7ee2SSatish Balay PetscReal n1,ni,n2; 964f17f7ee2SSatish Balay PetscReal n1A,niA,n2A; 965f17f7ee2SSatish Balay void (*normfunc)(void); 966f17f7ee2SSatish Balay 967f17f7ee2SSatish Balay Ae = a->sampler->GetSamplingMat(); 968f17f7ee2SSatish Balay PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E)); 969f17f7ee2SSatish Balay PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 970f17f7ee2SSatish Balay PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN)); 971f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_1,&n1)); 972f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_INFINITY,&ni)); 973f17f7ee2SSatish Balay PetscCall(MatNorm(E,NORM_2,&n2)); 974f17f7ee2SSatish Balay if (checke) { 975f17f7ee2SSatish Balay Mat eA,eE,eAe; 976f17f7ee2SSatish Balay 977f17f7ee2SSatish Balay PetscCall(MatComputeOperator(A,MATAIJ,&eA)); 978f17f7ee2SSatish Balay PetscCall(MatComputeOperator(E,MATAIJ,&eE)); 979f17f7ee2SSatish Balay PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe)); 980f17f7ee2SSatish Balay PetscCall(MatChop(eA,PETSC_SMALL)); 981f17f7ee2SSatish Balay PetscCall(MatChop(eE,PETSC_SMALL)); 982f17f7ee2SSatish Balay PetscCall(MatChop(eAe,PETSC_SMALL)); 983f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat")); 984f17f7ee2SSatish Balay PetscCall(MatView(eA,NULL)); 985f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eAe,"S")); 986f17f7ee2SSatish Balay PetscCall(MatView(eAe,NULL)); 987f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S")); 988f17f7ee2SSatish Balay PetscCall(MatView(eE,NULL)); 989f17f7ee2SSatish Balay PetscCall(MatDestroy(&eA)); 990f17f7ee2SSatish Balay PetscCall(MatDestroy(&eE)); 991f17f7ee2SSatish Balay PetscCall(MatDestroy(&eAe)); 992f17f7ee2SSatish Balay } 993f17f7ee2SSatish Balay 994f17f7ee2SSatish Balay PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc)); 995f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS)); 996f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_1,&n1A)); 997f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_INFINITY,&niA)); 998f17f7ee2SSatish Balay PetscCall(MatNorm(Ae,NORM_2,&n2A)); 999f17f7ee2SSatish Balay n1A = PetscMax(n1A,PETSC_SMALL); 1000f17f7ee2SSatish Balay n2A = PetscMax(n2A,PETSC_SMALL); 1001f17f7ee2SSatish Balay niA = PetscMax(niA,PETSC_SMALL); 1002f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc)); 1003f17f7ee2SSatish Balay PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A))); 1004f17f7ee2SSatish Balay PetscCall(MatDestroy(&E)); 1005f17f7ee2SSatish Balay } 1006f17f7ee2SSatish Balay a->sampler->SetSamplingMat(NULL); 1007f17f7ee2SSatish Balay } 1008f17f7ee2SSatish Balay PetscFunctionReturn(0); 1009f17f7ee2SSatish Balay } 1010f17f7ee2SSatish Balay 1011f17f7ee2SSatish Balay static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1012f17f7ee2SSatish Balay { 1013f17f7ee2SSatish Balay PetscMPIInt size; 1014f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1015f17f7ee2SSatish Balay 1016f17f7ee2SSatish Balay PetscFunctionBegin; 1017f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1018*036e4bdcSSatish Balay PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported"); 1019f17f7ee2SSatish Balay else { 1020f17f7ee2SSatish Balay a->hmatrix->clearData(); 1021f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1022f17f7ee2SSatish Balay if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1023f17f7ee2SSatish Balay #endif 1024f17f7ee2SSatish Balay } 1025f17f7ee2SSatish Balay PetscFunctionReturn(0); 1026f17f7ee2SSatish Balay } 1027f17f7ee2SSatish Balay 1028f17f7ee2SSatish Balay static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1029f17f7ee2SSatish Balay { 1030f17f7ee2SSatish Balay Mat A; 1031f17f7ee2SSatish Balay Mat_H2OPUS *a, *b = (Mat_H2OPUS*)B->data; 1032f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1033f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1034f17f7ee2SSatish Balay #else 1035f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1036f17f7ee2SSatish Balay #endif 1037f17f7ee2SSatish Balay MPI_Comm comm; 1038f17f7ee2SSatish Balay 1039f17f7ee2SSatish Balay PetscFunctionBegin; 1040f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1041f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1042f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1043f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1044f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B,A)); 1045f17f7ee2SSatish Balay a = (Mat_H2OPUS*)A->data; 1046f17f7ee2SSatish Balay 1047f17f7ee2SSatish Balay a->eta = b->eta; 1048f17f7ee2SSatish Balay a->leafsize = b->leafsize; 1049f17f7ee2SSatish Balay a->basisord = b->basisord; 1050f17f7ee2SSatish Balay a->max_rank = b->max_rank; 1051f17f7ee2SSatish Balay a->bs = b->bs; 1052f17f7ee2SSatish Balay a->rtol = b->rtol; 1053f17f7ee2SSatish Balay a->norm_max_samples = b->norm_max_samples; 1054f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) a->s = b->s; 1055f17f7ee2SSatish Balay 1056f17f7ee2SSatish Balay a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1057f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1058f17f7ee2SSatish Balay 1059f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1060f17f7ee2SSatish Balay if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); } 1061f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1062f17f7ee2SSatish Balay if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); } 1063f17f7ee2SSatish Balay #endif 1064f17f7ee2SSatish Balay #endif 1065f17f7ee2SSatish Balay if (b->hmatrix) { 1066f17f7ee2SSatish Balay a->hmatrix = new HMatrix(*b->hmatrix); 1067f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1068f17f7ee2SSatish Balay } 1069f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1070f17f7ee2SSatish Balay if (b->hmatrix_gpu) { 1071f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1072f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1073f17f7ee2SSatish Balay } 1074f17f7ee2SSatish Balay #endif 1075f17f7ee2SSatish Balay if (b->sf) { 1076f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->sf)); 1077f17f7ee2SSatish Balay a->sf = b->sf; 1078f17f7ee2SSatish Balay } 1079f17f7ee2SSatish Balay if (b->h2opus_indexmap) { 1080f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1081f17f7ee2SSatish Balay a->h2opus_indexmap = b->h2opus_indexmap; 1082f17f7ee2SSatish Balay } 1083f17f7ee2SSatish Balay 1084f17f7ee2SSatish Balay PetscCall(MatSetUp(A)); 1085f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 1086f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) { 1087f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 1088f17f7ee2SSatish Balay a->orthogonal = b->orthogonal; 1089f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1090f17f7ee2SSatish Balay A->offloadmask = B->offloadmask; 1091f17f7ee2SSatish Balay #endif 1092f17f7ee2SSatish Balay } 1093f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1094f17f7ee2SSatish Balay iscpu = B->boundtocpu; 1095f17f7ee2SSatish Balay #endif 1096f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,iscpu)); 1097f17f7ee2SSatish Balay 1098f17f7ee2SSatish Balay *nA = A; 1099f17f7ee2SSatish Balay PetscFunctionReturn(0); 1100f17f7ee2SSatish Balay } 1101f17f7ee2SSatish Balay 1102f17f7ee2SSatish Balay static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1103f17f7ee2SSatish Balay { 1104f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1105*036e4bdcSSatish Balay PetscBool isascii, vieweps; 1106f17f7ee2SSatish Balay PetscMPIInt size; 1107f17f7ee2SSatish Balay PetscViewerFormat format; 1108f17f7ee2SSatish Balay 1109f17f7ee2SSatish Balay PetscFunctionBegin; 1110f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii)); 1111f17f7ee2SSatish Balay PetscCall(PetscViewerGetFormat(view,&format)); 1112f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1113f17f7ee2SSatish Balay if (isascii) { 1114f17f7ee2SSatish Balay if (format == PETSC_VIEWER_ASCII_MATLAB) { 1115f17f7ee2SSatish Balay if (size == 1) { 1116f17f7ee2SSatish Balay FILE *fp; 1117f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIGetPointer(view,&fp)); 1118f17f7ee2SSatish Balay dumpHMatrix(*h2opus->hmatrix,6,fp); 1119f17f7ee2SSatish Balay } 1120f17f7ee2SSatish Balay } else { 1121f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat")); 1122f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0)); 1123f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta)); 1124f17f7ee2SSatish Balay if (!h2opus->kernel) { 1125f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol)); 1126f17f7ee2SSatish Balay } else { 1127f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord)); 1128f17f7ee2SSatish Balay } 1129f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples)); 1130f17f7ee2SSatish Balay if (size == 1) { 1131f17f7ee2SSatish Balay double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1132f17f7ee2SSatish Balay double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1133f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1134f17f7ee2SSatish Balay double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1135f17f7ee2SSatish Balay double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1136f17f7ee2SSatish Balay #endif 1137f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1138f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1139f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1140f17f7ee2SSatish Balay #endif 1141f17f7ee2SSatish Balay } else { 1142f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1143f17f7ee2SSatish Balay double matrix_mem[4] = {0.,0.,0.,0.}; 1144f17f7ee2SSatish Balay PetscMPIInt rsize = 4; 1145f17f7ee2SSatish Balay #else 1146f17f7ee2SSatish Balay double matrix_mem[2] = {0.,0.}; 1147f17f7ee2SSatish Balay PetscMPIInt rsize = 2; 1148f17f7ee2SSatish Balay #endif 1149f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1150f17f7ee2SSatish Balay matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1151f17f7ee2SSatish Balay matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1152f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1153f17f7ee2SSatish Balay matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1154f17f7ee2SSatish Balay matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1155f17f7ee2SSatish Balay #endif 1156f17f7ee2SSatish Balay #endif 1157f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A))); 1158f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1159f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1160f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view," Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1161f17f7ee2SSatish Balay #endif 1162f17f7ee2SSatish Balay } 1163f17f7ee2SSatish Balay } 1164f17f7ee2SSatish Balay } 1165*036e4bdcSSatish Balay vieweps = PETSC_FALSE; 1166*036e4bdcSSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL)); 1167*036e4bdcSSatish Balay if (vieweps) { 1168f17f7ee2SSatish Balay char filename[256]; 1169f17f7ee2SSatish Balay const char *name; 1170f17f7ee2SSatish Balay 1171f17f7ee2SSatish Balay PetscCall(PetscObjectGetName((PetscObject)A,&name)); 1172f17f7ee2SSatish Balay PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name)); 1173*036e4bdcSSatish Balay PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL)); 1174f17f7ee2SSatish Balay outputEps(*h2opus->hmatrix,filename); 1175f17f7ee2SSatish Balay } 1176f17f7ee2SSatish Balay PetscFunctionReturn(0); 1177f17f7ee2SSatish Balay } 1178f17f7ee2SSatish Balay 1179f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) 1180f17f7ee2SSatish Balay { 1181f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data; 1182f17f7ee2SSatish Balay PetscReal *gcoords; 1183f17f7ee2SSatish Balay PetscInt N; 1184f17f7ee2SSatish Balay MPI_Comm comm; 1185f17f7ee2SSatish Balay PetscMPIInt size; 1186f17f7ee2SSatish Balay PetscBool cong; 1187f17f7ee2SSatish Balay 1188f17f7ee2SSatish Balay PetscFunctionBegin; 1189f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->rmap)); 1190f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->cmap)); 1191f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 1192f17f7ee2SSatish Balay PetscCall(MatHasCongruentLayouts(A,&cong)); 1193f17f7ee2SSatish Balay PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts"); 1194f17f7ee2SSatish Balay N = A->rmap->N; 1195f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm,&size)); 1196f17f7ee2SSatish Balay if (spacedim > 0 && size > 1 && cdist) { 1197f17f7ee2SSatish Balay PetscSF sf; 1198f17f7ee2SSatish Balay MPI_Datatype dtype; 1199f17f7ee2SSatish Balay 1200f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype)); 1201f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_commit(&dtype)); 1202f17f7ee2SSatish Balay 1203f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm,&sf)); 1204f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER)); 1205f17f7ee2SSatish Balay PetscCall(PetscMalloc1(spacedim*N,&gcoords)); 1206f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE)); 1207f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE)); 1208f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&sf)); 1209f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_free(&dtype)); 1210f17f7ee2SSatish Balay } else gcoords = (PetscReal*)coords; 1211f17f7ee2SSatish Balay 1212f17f7ee2SSatish Balay delete h2opus->ptcloud; 1213f17f7ee2SSatish Balay delete h2opus->kernel; 1214f17f7ee2SSatish Balay h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords); 1215f17f7ee2SSatish Balay if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx); 1216f17f7ee2SSatish Balay if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1217f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 1218f17f7ee2SSatish Balay PetscFunctionReturn(0); 1219f17f7ee2SSatish Balay } 1220f17f7ee2SSatish Balay 1221f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1222f17f7ee2SSatish Balay static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1223f17f7ee2SSatish Balay { 1224f17f7ee2SSatish Balay PetscMPIInt size; 1225f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1226f17f7ee2SSatish Balay 1227f17f7ee2SSatish Balay PetscFunctionBegin; 1228f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1229f17f7ee2SSatish Balay if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1230f17f7ee2SSatish Balay if (size > 1) { 1231f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1232f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1233f17f7ee2SSatish Balay if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1234f17f7ee2SSatish Balay else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1235f17f7ee2SSatish Balay #endif 1236f17f7ee2SSatish Balay } else { 1237f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1238f17f7ee2SSatish Balay if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1239f17f7ee2SSatish Balay else *a->hmatrix = *a->hmatrix_gpu; 1240f17f7ee2SSatish Balay } 1241f17f7ee2SSatish Balay delete a->hmatrix_gpu; 1242f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 1243f17f7ee2SSatish Balay a->hmatrix_gpu = NULL; 1244f17f7ee2SSatish Balay a->dist_hmatrix_gpu = NULL; 1245f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1246f17f7ee2SSatish Balay } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1247f17f7ee2SSatish Balay if (size > 1) { 1248f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1249f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1250f17f7ee2SSatish Balay if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1251f17f7ee2SSatish Balay else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1252f17f7ee2SSatish Balay #endif 1253f17f7ee2SSatish Balay } else { 1254f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1255f17f7ee2SSatish Balay if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1256f17f7ee2SSatish Balay else *a->hmatrix_gpu = *a->hmatrix; 1257f17f7ee2SSatish Balay } 1258f17f7ee2SSatish Balay delete a->hmatrix; 1259f17f7ee2SSatish Balay delete a->dist_hmatrix; 1260f17f7ee2SSatish Balay a->hmatrix = NULL; 1261f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 1262f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_GPU; 1263f17f7ee2SSatish Balay } 1264f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1265f17f7ee2SSatish Balay if (!flg) { 1266f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1267f17f7ee2SSatish Balay } else { 1268f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype)); 1269f17f7ee2SSatish Balay } 1270f17f7ee2SSatish Balay A->boundtocpu = flg; 1271f17f7ee2SSatish Balay PetscFunctionReturn(0); 1272f17f7ee2SSatish Balay } 1273f17f7ee2SSatish Balay #endif 1274f17f7ee2SSatish Balay 1275f17f7ee2SSatish Balay /*MC 1276f17f7ee2SSatish Balay MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1277f17f7ee2SSatish Balay 1278f17f7ee2SSatish Balay Options Database Keys: 1279f17f7ee2SSatish Balay . -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions() 1280f17f7ee2SSatish Balay 1281f17f7ee2SSatish Balay Notes: 1282f17f7ee2SSatish Balay H2Opus implements hierarchical matrices in the H^2 flavour. 1283f17f7ee2SSatish Balay It supports CPU or NVIDIA GPUs. 1284f17f7ee2SSatish Balay For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus. 1285f17f7ee2SSatish Balay In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas. 1286f17f7ee2SSatish Balay For details and additional references, see 1287f17f7ee2SSatish Balay "H2Opus: A distributed-memory multi-GPU software package for non-local operators", 1288f17f7ee2SSatish Balay available at https://arxiv.org/abs/2109.05451. 1289f17f7ee2SSatish Balay 1290f17f7ee2SSatish Balay Level: beginner 1291f17f7ee2SSatish Balay 1292*036e4bdcSSatish Balay .seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()` 1293f17f7ee2SSatish Balay M*/ 1294f17f7ee2SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1295f17f7ee2SSatish Balay { 1296f17f7ee2SSatish Balay Mat_H2OPUS *a; 1297f17f7ee2SSatish Balay PetscMPIInt size; 1298f17f7ee2SSatish Balay 1299f17f7ee2SSatish Balay PetscFunctionBegin; 1300f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1301f17f7ee2SSatish Balay PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1302f17f7ee2SSatish Balay #endif 1303f17f7ee2SSatish Balay PetscCall(PetscNewLog(A,&a)); 1304f17f7ee2SSatish Balay A->data = (void*)a; 1305f17f7ee2SSatish Balay 1306f17f7ee2SSatish Balay a->eta = 0.9; 1307f17f7ee2SSatish Balay a->leafsize = 32; 1308f17f7ee2SSatish Balay a->basisord = 4; 1309f17f7ee2SSatish Balay a->max_rank = 64; 1310f17f7ee2SSatish Balay a->bs = 32; 1311f17f7ee2SSatish Balay a->rtol = 1.e-4; 1312f17f7ee2SSatish Balay a->s = 1.0; 1313f17f7ee2SSatish Balay a->norm_max_samples = 10; 1314*036e4bdcSSatish Balay a->resize = PETSC_TRUE; /* reallocate after compression */ 1315f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1316f17f7ee2SSatish Balay h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A)); 1317f17f7ee2SSatish Balay #else 1318f17f7ee2SSatish Balay h2opusCreateHandle(&a->handle); 1319f17f7ee2SSatish Balay #endif 1320f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1321f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS)); 1322f17f7ee2SSatish Balay PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps))); 1323f17f7ee2SSatish Balay 1324f17f7ee2SSatish Balay A->ops->destroy = MatDestroy_H2OPUS; 1325f17f7ee2SSatish Balay A->ops->view = MatView_H2OPUS; 1326f17f7ee2SSatish Balay A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1327f17f7ee2SSatish Balay A->ops->mult = MatMult_H2OPUS; 1328f17f7ee2SSatish Balay A->ops->multtranspose = MatMultTranspose_H2OPUS; 1329f17f7ee2SSatish Balay A->ops->multadd = MatMultAdd_H2OPUS; 1330f17f7ee2SSatish Balay A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1331f17f7ee2SSatish Balay A->ops->scale = MatScale_H2OPUS; 1332f17f7ee2SSatish Balay A->ops->duplicate = MatDuplicate_H2OPUS; 1333f17f7ee2SSatish Balay A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1334f17f7ee2SSatish Balay A->ops->norm = MatNorm_H2OPUS; 1335f17f7ee2SSatish Balay A->ops->zeroentries = MatZeroEntries_H2OPUS; 1336f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1337f17f7ee2SSatish Balay A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1338f17f7ee2SSatish Balay #endif 1339f17f7ee2SSatish Balay 1340f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS)); 1341f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS)); 1342f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS)); 1343f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS)); 1344f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1345f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1346f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 1347f17f7ee2SSatish Balay #endif 1348f17f7ee2SSatish Balay PetscFunctionReturn(0); 1349f17f7ee2SSatish Balay } 1350f17f7ee2SSatish Balay 1351f17f7ee2SSatish Balay /*@C 1352f17f7ee2SSatish Balay MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1353f17f7ee2SSatish Balay 1354f17f7ee2SSatish Balay Input Parameter: 1355f17f7ee2SSatish Balay . A - the matrix 1356f17f7ee2SSatish Balay 1357f17f7ee2SSatish Balay Level: intermediate 1358f17f7ee2SSatish Balay 1359*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()` 1360f17f7ee2SSatish Balay @*/ 1361f17f7ee2SSatish Balay PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1362f17f7ee2SSatish Balay { 1363f17f7ee2SSatish Balay PetscBool ish2opus; 1364f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1365f17f7ee2SSatish Balay PetscMPIInt size; 1366f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1367f17f7ee2SSatish Balay 1368f17f7ee2SSatish Balay PetscFunctionBegin; 1369f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1370f17f7ee2SSatish Balay PetscValidType(A,1); 1371f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1372f17f7ee2SSatish Balay if (!ish2opus) PetscFunctionReturn(0); 1373f17f7ee2SSatish Balay if (a->orthogonal) PetscFunctionReturn(0); 1374f17f7ee2SSatish Balay HLibProfile::clear(); 1375f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0)); 1376f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1377f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1378f17f7ee2SSatish Balay #endif 1379f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1380f17f7ee2SSatish Balay if (size > 1) { 1381f17f7ee2SSatish Balay if (boundtocpu) { 1382f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1383f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1384f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix, a->handle); 1385f17f7ee2SSatish Balay #endif 1386f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1387f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1388f17f7ee2SSatish Balay } else { 1389f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1390f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1391f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1392f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1393f17f7ee2SSatish Balay #endif 1394f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1395f17f7ee2SSatish Balay #endif 1396f17f7ee2SSatish Balay } 1397f17f7ee2SSatish Balay } else { 1398f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1399f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1400f17f7ee2SSatish Balay #else 1401f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1402f17f7ee2SSatish Balay #endif 1403f17f7ee2SSatish Balay if (boundtocpu) { 1404f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1405f17f7ee2SSatish Balay horthog(*a->hmatrix, handle); 1406f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1407f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1408f17f7ee2SSatish Balay } else { 1409f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1410f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1411f17f7ee2SSatish Balay horthog(*a->hmatrix_gpu, handle); 1412f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1413f17f7ee2SSatish Balay #endif 1414f17f7ee2SSatish Balay } 1415f17f7ee2SSatish Balay } 1416f17f7ee2SSatish Balay a->orthogonal = PETSC_TRUE; 1417f17f7ee2SSatish Balay { /* log flops */ 1418f17f7ee2SSatish Balay double gops,time,perf,dev; 1419f17f7ee2SSatish Balay HLibProfile::getHorthogPerf(gops,time,perf,dev); 1420f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1421f17f7ee2SSatish Balay if (boundtocpu) { 1422f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1423f17f7ee2SSatish Balay } else { 1424f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 1425f17f7ee2SSatish Balay } 1426f17f7ee2SSatish Balay #else 1427f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1428f17f7ee2SSatish Balay #endif 1429f17f7ee2SSatish Balay } 1430f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0)); 1431f17f7ee2SSatish Balay PetscFunctionReturn(0); 1432f17f7ee2SSatish Balay } 1433f17f7ee2SSatish Balay 1434f17f7ee2SSatish Balay /*@C 1435f17f7ee2SSatish Balay MatH2OpusCompress - Compress a hierarchical matrix. 1436f17f7ee2SSatish Balay 1437f17f7ee2SSatish Balay Input Parameters: 1438f17f7ee2SSatish Balay + A - the matrix 1439f17f7ee2SSatish Balay - tol - the absolute truncation threshold 1440f17f7ee2SSatish Balay 1441f17f7ee2SSatish Balay Level: intermediate 1442f17f7ee2SSatish Balay 1443*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()` 1444f17f7ee2SSatish Balay @*/ 1445f17f7ee2SSatish Balay PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1446f17f7ee2SSatish Balay { 1447f17f7ee2SSatish Balay PetscBool ish2opus; 1448f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1449f17f7ee2SSatish Balay PetscMPIInt size; 1450f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1451f17f7ee2SSatish Balay 1452f17f7ee2SSatish Balay PetscFunctionBegin; 1453f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1454f17f7ee2SSatish Balay PetscValidType(A,1); 1455f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A,tol,2); 1456f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1457f17f7ee2SSatish Balay if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0); 1458f17f7ee2SSatish Balay PetscCall(MatH2OpusOrthogonalize(A)); 1459f17f7ee2SSatish Balay HLibProfile::clear(); 1460f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0)); 1461f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1462f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1463f17f7ee2SSatish Balay #endif 1464f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1465f17f7ee2SSatish Balay if (size > 1) { 1466f17f7ee2SSatish Balay if (boundtocpu) { 1467f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1468f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1469f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1470*036e4bdcSSatish Balay if (a->resize) { 1471*036e4bdcSSatish Balay DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix); 1472*036e4bdcSSatish Balay delete a->dist_hmatrix; 1473*036e4bdcSSatish Balay a->dist_hmatrix = dist_hmatrix; 1474*036e4bdcSSatish Balay } 1475f17f7ee2SSatish Balay #endif 1476f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1477f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1478f17f7ee2SSatish Balay } else { 1479f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1480f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1481f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1482f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1483*036e4bdcSSatish Balay 1484*036e4bdcSSatish Balay if (a->resize) { 1485*036e4bdcSSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu); 1486*036e4bdcSSatish Balay delete a->dist_hmatrix_gpu; 1487*036e4bdcSSatish Balay a->dist_hmatrix_gpu = dist_hmatrix_gpu; 1488*036e4bdcSSatish Balay } 1489f17f7ee2SSatish Balay #endif 1490f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1491f17f7ee2SSatish Balay #endif 1492f17f7ee2SSatish Balay } 1493f17f7ee2SSatish Balay } else { 1494f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1495f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1496f17f7ee2SSatish Balay #else 1497f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1498f17f7ee2SSatish Balay #endif 1499f17f7ee2SSatish Balay if (boundtocpu) { 1500f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1501f17f7ee2SSatish Balay hcompress(*a->hmatrix, tol, handle); 1502*036e4bdcSSatish Balay 1503*036e4bdcSSatish Balay if (a->resize) { 1504*036e4bdcSSatish Balay HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1505*036e4bdcSSatish Balay delete a->hmatrix; 1506*036e4bdcSSatish Balay a->hmatrix = hmatrix; 1507*036e4bdcSSatish Balay } 1508f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1509f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1510f17f7ee2SSatish Balay } else { 1511f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1512f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1513f17f7ee2SSatish Balay hcompress(*a->hmatrix_gpu, tol, handle); 1514f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1515*036e4bdcSSatish Balay 1516*036e4bdcSSatish Balay if (a->resize) { 1517*036e4bdcSSatish Balay HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1518*036e4bdcSSatish Balay delete a->hmatrix_gpu; 1519*036e4bdcSSatish Balay a->hmatrix_gpu = hmatrix_gpu; 1520*036e4bdcSSatish Balay } 1521f17f7ee2SSatish Balay #endif 1522f17f7ee2SSatish Balay } 1523f17f7ee2SSatish Balay } 1524f17f7ee2SSatish Balay { /* log flops */ 1525f17f7ee2SSatish Balay double gops,time,perf,dev; 1526f17f7ee2SSatish Balay HLibProfile::getHcompressPerf(gops,time,perf,dev); 1527f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1528f17f7ee2SSatish Balay if (boundtocpu) { 1529f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1530f17f7ee2SSatish Balay } else { 1531f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9*gops)); 1532f17f7ee2SSatish Balay } 1533f17f7ee2SSatish Balay #else 1534f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9*gops)); 1535f17f7ee2SSatish Balay #endif 1536f17f7ee2SSatish Balay } 1537f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0)); 1538f17f7ee2SSatish Balay PetscFunctionReturn(0); 1539f17f7ee2SSatish Balay } 1540f17f7ee2SSatish Balay 1541f17f7ee2SSatish Balay /*@C 1542f17f7ee2SSatish Balay MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix. 1543f17f7ee2SSatish Balay 1544f17f7ee2SSatish Balay Input Parameters: 1545f17f7ee2SSatish Balay + A - the hierarchical matrix 1546f17f7ee2SSatish Balay . B - the matrix to be sampled 1547f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1548f17f7ee2SSatish Balay - tol - relative tolerance for construction 1549f17f7ee2SSatish Balay 1550f17f7ee2SSatish Balay Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix. 1551f17f7ee2SSatish Balay 1552f17f7ee2SSatish Balay Level: intermediate 1553f17f7ee2SSatish Balay 1554*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1555f17f7ee2SSatish Balay @*/ 1556f17f7ee2SSatish Balay PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1557f17f7ee2SSatish Balay { 1558f17f7ee2SSatish Balay PetscBool ish2opus; 1559f17f7ee2SSatish Balay 1560f17f7ee2SSatish Balay PetscFunctionBegin; 1561f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1562f17f7ee2SSatish Balay PetscValidType(A,1); 1563f17f7ee2SSatish Balay if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2); 1564f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(A,bs,3); 1565f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A,tol,3); 1566f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1567f17f7ee2SSatish Balay if (ish2opus) { 1568f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1569f17f7ee2SSatish Balay 1570f17f7ee2SSatish Balay if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1571f17f7ee2SSatish Balay a->sampler->SetSamplingMat(B); 1572f17f7ee2SSatish Balay if (bs > 0) a->bs = bs; 1573f17f7ee2SSatish Balay if (tol > 0.) a->rtol = tol; 1574f17f7ee2SSatish Balay delete a->kernel; 1575f17f7ee2SSatish Balay } 1576f17f7ee2SSatish Balay PetscFunctionReturn(0); 1577f17f7ee2SSatish Balay } 1578f17f7ee2SSatish Balay 1579f17f7ee2SSatish Balay /*@C 1580f17f7ee2SSatish Balay MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel. 1581f17f7ee2SSatish Balay 1582f17f7ee2SSatish Balay Input Parameters: 1583f17f7ee2SSatish Balay + comm - MPI communicator 1584f17f7ee2SSatish Balay . m - number of local rows (or PETSC_DECIDE to have calculated if M is given) 1585f17f7ee2SSatish Balay . n - number of local columns (or PETSC_DECIDE to have calculated if N is given) 1586f17f7ee2SSatish Balay . M - number of global rows (or PETSC_DETERMINE to have calculated if m is given) 1587f17f7ee2SSatish Balay . N - number of global columns (or PETSC_DETERMINE to have calculated if n is given) 1588f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1589f17f7ee2SSatish Balay . coords - coordinates of the points 1590f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 1591f17f7ee2SSatish Balay . kernel - computational kernel (or NULL) 1592f17f7ee2SSatish Balay . kernelctx - kernel context 1593f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1594f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1595f17f7ee2SSatish Balay - basisord - approximation order for Chebychev interpolation of low-rank blocks 1596f17f7ee2SSatish Balay 1597f17f7ee2SSatish Balay Output Parameter: 1598f17f7ee2SSatish Balay . nA - matrix 1599f17f7ee2SSatish Balay 1600f17f7ee2SSatish Balay Options Database Keys: 1601f17f7ee2SSatish Balay + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1602f17f7ee2SSatish Balay . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1603f17f7ee2SSatish Balay . -mat_h2opus_order <PetscInt> - Chebychev approximation order 1604f17f7ee2SSatish Balay - -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms 1605f17f7ee2SSatish Balay 1606f17f7ee2SSatish Balay Level: intermediate 1607f17f7ee2SSatish Balay 1608*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()` 1609f17f7ee2SSatish Balay @*/ 1610f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA) 1611f17f7ee2SSatish Balay { 1612f17f7ee2SSatish Balay Mat A; 1613f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1614f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1615f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1616f17f7ee2SSatish Balay #else 1617f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1618f17f7ee2SSatish Balay #endif 1619f17f7ee2SSatish Balay 1620f17f7ee2SSatish Balay PetscFunctionBegin; 1621*036e4bdcSSatish Balay PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 1622f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1623f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,m,n,M,N)); 1624*036e4bdcSSatish Balay PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1625f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1626f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,iscpu)); 1627f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx)); 1628f17f7ee2SSatish Balay 1629f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS*)A->data; 1630f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1631f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1632f17f7ee2SSatish Balay if (basisord > 0) h2opus->basisord = basisord; 1633f17f7ee2SSatish Balay 1634f17f7ee2SSatish Balay *nA = A; 1635f17f7ee2SSatish Balay PetscFunctionReturn(0); 1636f17f7ee2SSatish Balay } 1637f17f7ee2SSatish Balay 1638f17f7ee2SSatish Balay /*@C 1639f17f7ee2SSatish Balay MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator. 1640f17f7ee2SSatish Balay 1641f17f7ee2SSatish Balay Input Parameters: 1642f17f7ee2SSatish Balay + B - the matrix to be sampled 1643f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1644f17f7ee2SSatish Balay . coords - coordinates of the points 1645f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 1646f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1647f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1648f17f7ee2SSatish Balay . maxrank - maximum rank allowed 1649f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1650f17f7ee2SSatish Balay - rtol - relative tolerance for construction 1651f17f7ee2SSatish Balay 1652f17f7ee2SSatish Balay Output Parameter: 1653f17f7ee2SSatish Balay . nA - matrix 1654f17f7ee2SSatish Balay 1655f17f7ee2SSatish Balay Options Database Keys: 1656f17f7ee2SSatish Balay + -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree 1657f17f7ee2SSatish Balay . -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance 1658f17f7ee2SSatish Balay . -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs 1659f17f7ee2SSatish Balay . -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs 1660f17f7ee2SSatish Balay . -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling 1661f17f7ee2SSatish Balay . -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd() 1662f17f7ee2SSatish Balay . -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction 1663f17f7ee2SSatish Balay - -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms 1664f17f7ee2SSatish Balay 1665f17f7ee2SSatish Balay Notes: not available in parallel 1666f17f7ee2SSatish Balay 1667f17f7ee2SSatish Balay Level: intermediate 1668f17f7ee2SSatish Balay 1669*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()` 1670f17f7ee2SSatish Balay @*/ 1671f17f7ee2SSatish Balay PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) 1672f17f7ee2SSatish Balay { 1673f17f7ee2SSatish Balay Mat A; 1674f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1675f17f7ee2SSatish Balay MPI_Comm comm; 1676f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1677f17f7ee2SSatish Balay 1678f17f7ee2SSatish Balay PetscFunctionBegin; 1679f17f7ee2SSatish Balay PetscValidHeaderSpecific(B,MAT_CLASSID,1); 1680f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,spacedim,2); 1681f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(B,cdist,4); 1682f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B,eta,5); 1683f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,leafsize,6); 1684f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,maxrank,7); 1685f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B,bs,8); 1686f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B,rtol,9); 1687f17f7ee2SSatish Balay PetscValidPointer(nA,10); 1688f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B,&comm)); 1689*036e4bdcSSatish Balay PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported"); 1690*036e4bdcSSatish Balay PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported"); 1691f17f7ee2SSatish Balay PetscCall(MatCreate(comm,&A)); 1692f17f7ee2SSatish Balay PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N)); 1693f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1694f17f7ee2SSatish Balay { 1695f17f7ee2SSatish Balay PetscBool iscuda; 1696f17f7ee2SSatish Balay VecType vtype; 1697f17f7ee2SSatish Balay 1698f17f7ee2SSatish Balay PetscCall(MatGetVecType(B,&vtype)); 1699f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda)); 1700f17f7ee2SSatish Balay if (!iscuda) { 1701f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda)); 1702f17f7ee2SSatish Balay if (!iscuda) { 1703f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda)); 1704f17f7ee2SSatish Balay } 1705f17f7ee2SSatish Balay } 1706f17f7ee2SSatish Balay if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1707f17f7ee2SSatish Balay } 1708f17f7ee2SSatish Balay #endif 1709f17f7ee2SSatish Balay PetscCall(MatSetType(A,MATH2OPUS)); 1710f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A,boundtocpu)); 1711f17f7ee2SSatish Balay if (spacedim) { 1712f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL)); 1713f17f7ee2SSatish Balay } 1714f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B,A)); 1715f17f7ee2SSatish Balay /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1716f17f7ee2SSatish Balay 1717f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS*)A->data; 1718f17f7ee2SSatish Balay h2opus->sampler = new PetscMatrixSampler(B); 1719f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1720f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1721f17f7ee2SSatish Balay if (maxrank > 0) h2opus->max_rank = maxrank; 1722f17f7ee2SSatish Balay if (bs > 0) h2opus->bs = bs; 1723f17f7ee2SSatish Balay if (rtol > 0.) h2opus->rtol = rtol; 1724f17f7ee2SSatish Balay *nA = A; 1725f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 1726f17f7ee2SSatish Balay PetscFunctionReturn(0); 1727f17f7ee2SSatish Balay } 1728f17f7ee2SSatish Balay 1729f17f7ee2SSatish Balay /*@C 1730f17f7ee2SSatish Balay MatH2OpusGetIndexMap - Access reordering index set. 1731f17f7ee2SSatish Balay 1732f17f7ee2SSatish Balay Input Parameters: 1733f17f7ee2SSatish Balay . A - the matrix 1734f17f7ee2SSatish Balay 1735f17f7ee2SSatish Balay Output Parameter: 1736f17f7ee2SSatish Balay . indexmap - the index set for the reordering 1737f17f7ee2SSatish Balay 1738f17f7ee2SSatish Balay Level: intermediate 1739f17f7ee2SSatish Balay 1740*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1741f17f7ee2SSatish Balay @*/ 1742f17f7ee2SSatish Balay PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1743f17f7ee2SSatish Balay { 1744f17f7ee2SSatish Balay PetscBool ish2opus; 1745f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1746f17f7ee2SSatish Balay 1747f17f7ee2SSatish Balay PetscFunctionBegin; 1748f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1749f17f7ee2SSatish Balay PetscValidType(A,1); 1750f17f7ee2SSatish Balay PetscValidPointer(indexmap,2); 1751f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1752f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1753f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 1754f17f7ee2SSatish Balay *indexmap = a->h2opus_indexmap; 1755f17f7ee2SSatish Balay PetscFunctionReturn(0); 1756f17f7ee2SSatish Balay } 1757f17f7ee2SSatish Balay 1758f17f7ee2SSatish Balay /*@C 1759f17f7ee2SSatish Balay MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1760f17f7ee2SSatish Balay 1761f17f7ee2SSatish Balay Input Parameters: 1762f17f7ee2SSatish Balay + A - the matrix 1763f17f7ee2SSatish Balay . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1764f17f7ee2SSatish Balay - in - the vector to be mapped 1765f17f7ee2SSatish Balay 1766f17f7ee2SSatish Balay Output Parameter: 1767f17f7ee2SSatish Balay . out - the newly created mapped vector 1768f17f7ee2SSatish Balay 1769f17f7ee2SSatish Balay Level: intermediate 1770f17f7ee2SSatish Balay 1771*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1772f17f7ee2SSatish Balay @*/ 1773f17f7ee2SSatish Balay PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out) 1774f17f7ee2SSatish Balay { 1775f17f7ee2SSatish Balay PetscBool ish2opus; 1776f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1777f17f7ee2SSatish Balay PetscScalar *xin,*xout; 1778f17f7ee2SSatish Balay PetscBool nm; 1779f17f7ee2SSatish Balay 1780f17f7ee2SSatish Balay PetscFunctionBegin; 1781f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1782f17f7ee2SSatish Balay PetscValidType(A,1); 1783f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A,nativetopetsc,2); 1784f17f7ee2SSatish Balay PetscValidHeaderSpecific(in,VEC_CLASSID,3); 1785f17f7ee2SSatish Balay PetscValidPointer(out,4); 1786f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1787f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus)); 1788f17f7ee2SSatish Balay PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name); 1789f17f7ee2SSatish Balay nm = a->nativemult; 1790f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc)); 1791f17f7ee2SSatish Balay PetscCall(MatCreateVecs(A,out,NULL)); 1792f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A,nm)); 1793f17f7ee2SSatish Balay if (!a->sf) { /* same ordering */ 1794f17f7ee2SSatish Balay PetscCall(VecCopy(in,*out)); 1795f17f7ee2SSatish Balay PetscFunctionReturn(0); 1796f17f7ee2SSatish Balay } 1797f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin)); 1798f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(*out,&xout)); 1799f17f7ee2SSatish Balay if (nativetopetsc) { 1800f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1801f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1802f17f7ee2SSatish Balay } else { 1803f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1804f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE)); 1805f17f7ee2SSatish Balay } 1806f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin)); 1807f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(*out,&xout)); 1808f17f7ee2SSatish Balay PetscFunctionReturn(0); 1809f17f7ee2SSatish Balay } 1810f17f7ee2SSatish Balay 1811f17f7ee2SSatish Balay /*@C 1812f17f7ee2SSatish Balay MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T 1813f17f7ee2SSatish Balay 1814f17f7ee2SSatish Balay Input Parameters: 1815f17f7ee2SSatish Balay + A - the hierarchical matrix 1816f17f7ee2SSatish Balay . s - the scaling factor 1817f17f7ee2SSatish Balay . U - the dense low-rank update matrix 1818f17f7ee2SSatish Balay - V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed) 1819f17f7ee2SSatish Balay 1820f17f7ee2SSatish Balay Notes: The U and V matrices must be in dense format 1821f17f7ee2SSatish Balay 1822f17f7ee2SSatish Balay Level: intermediate 1823f17f7ee2SSatish Balay 1824*036e4bdcSSatish Balay .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE` 1825f17f7ee2SSatish Balay @*/ 1826f17f7ee2SSatish Balay PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1827f17f7ee2SSatish Balay { 1828f17f7ee2SSatish Balay PetscBool flg; 1829f17f7ee2SSatish Balay 1830f17f7ee2SSatish Balay PetscFunctionBegin; 1831f17f7ee2SSatish Balay PetscValidHeaderSpecific(A,MAT_CLASSID,1); 1832f17f7ee2SSatish Balay PetscValidType(A,1); 1833f17f7ee2SSatish Balay PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1834f17f7ee2SSatish Balay PetscValidHeaderSpecific(U,MAT_CLASSID,2); 1835f17f7ee2SSatish Balay PetscCheckSameComm(A,1,U,2); 1836f17f7ee2SSatish Balay if (V) { 1837f17f7ee2SSatish Balay PetscValidHeaderSpecific(V,MAT_CLASSID,3); 1838f17f7ee2SSatish Balay PetscCheckSameComm(A,1,V,3); 1839f17f7ee2SSatish Balay } 1840f17f7ee2SSatish Balay PetscValidLogicalCollectiveScalar(A,s,4); 1841f17f7ee2SSatish Balay 1842f17f7ee2SSatish Balay if (!V) V = U; 1843*036e4bdcSSatish Balay PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N); 1844f17f7ee2SSatish Balay if (!U->cmap->N) PetscFunctionReturn(0); 1845f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg)); 1846f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout"); 1847f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg)); 1848f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout"); 1849f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg)); 1850f17f7ee2SSatish Balay if (flg) { 1851f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS*)A->data; 1852f17f7ee2SSatish Balay const PetscScalar *u,*v,*uu,*vv; 1853f17f7ee2SSatish Balay PetscInt ldu,ldv; 1854f17f7ee2SSatish Balay PetscMPIInt size; 1855f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1856f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1857f17f7ee2SSatish Balay #else 1858f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1859f17f7ee2SSatish Balay #endif 1860f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1861f17f7ee2SSatish Balay PetscSF usf,vsf; 1862f17f7ee2SSatish Balay 1863f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1864*036e4bdcSSatish Balay PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel"); 1865f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0)); 1866f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1867f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name); 1868f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,"")); 1869f17f7ee2SSatish Balay PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name); 1870f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(U,&ldu)); 1871f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(V,&ldv)); 1872f17f7ee2SSatish Balay PetscCall(MatBoundToCPU(A,&flg)); 1873f17f7ee2SSatish Balay if (usesf) { 1874f17f7ee2SSatish Balay PetscInt n; 1875f17f7ee2SSatish Balay 1876f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf)); 1877f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf)); 1878f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N)); 1879f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL)); 1880f17f7ee2SSatish Balay ldu = n; 1881f17f7ee2SSatish Balay ldv = n; 1882f17f7ee2SSatish Balay } 1883f17f7ee2SSatish Balay if (flg) { 1884f17f7ee2SSatish Balay PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix"); 1885f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(U,&u)); 1886f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(V,&v)); 1887f17f7ee2SSatish Balay if (usesf) { 1888f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy); 1889f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1890f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1891f17f7ee2SSatish Balay if (U != V) { 1892f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx); 1893f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1894f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1895f17f7ee2SSatish Balay } else uu = vv; 1896f17f7ee2SSatish Balay } else { uu = u; vv = v; } 1897f17f7ee2SSatish Balay hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1898f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(U,&u)); 1899f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(V,&v)); 1900f17f7ee2SSatish Balay } else { 1901f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1902f17f7ee2SSatish Balay PetscBool flgU, flgV; 1903f17f7ee2SSatish Balay 1904f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix"); 1905f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,"")); 1906f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U)); 1907f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,"")); 1908f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V)); 1909f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(U,&u)); 1910f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(V,&v)); 1911f17f7ee2SSatish Balay if (usesf) { 1912f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1913f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1914f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE)); 1915f17f7ee2SSatish Balay if (U != V) { 1916f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1917f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1918f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE)); 1919f17f7ee2SSatish Balay } else uu = vv; 1920f17f7ee2SSatish Balay } else { uu = u; vv = v; } 1921f17f7ee2SSatish Balay #else 1922f17f7ee2SSatish Balay SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen"); 1923f17f7ee2SSatish Balay #endif 1924f17f7ee2SSatish Balay hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle); 1925f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1926f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(U,&u)); 1927f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(V,&v)); 1928f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U)); 1929f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V)); 1930f17f7ee2SSatish Balay #endif 1931f17f7ee2SSatish Balay } 1932f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0)); 1933f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 1934f17f7ee2SSatish Balay } 1935f17f7ee2SSatish Balay PetscFunctionReturn(0); 1936f17f7ee2SSatish Balay } 1937f17f7ee2SSatish Balay #endif 1938