1f17f7ee2SSatish Balay #include <h2opusconf.h> 2f17f7ee2SSatish Balay /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */ 3f17f7ee2SSatish Balay #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU) 4f17f7ee2SSatish Balay #include <h2opus.h> 5f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 6f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_h2opus_handle.h> 7f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_geometric_construction.h> 8f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hgemv.h> 9f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_horthog.h> 10f17f7ee2SSatish Balay #include <h2opus/distributed/distributed_hcompress.h> 11f17f7ee2SSatish Balay #endif 12f17f7ee2SSatish Balay #include <h2opus/util/boxentrygen.h> 13f17f7ee2SSatish Balay #include <petsc/private/matimpl.h> 14f17f7ee2SSatish Balay #include <petsc/private/vecimpl.h> 15f17f7ee2SSatish Balay #include <petsc/private/deviceimpl.h> 16f17f7ee2SSatish Balay #include <petscsf.h> 17f17f7ee2SSatish Balay 18f17f7ee2SSatish Balay /* math2opusutils */ 19f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF, PetscInt, PetscInt, PetscInt, PetscSF *); 20f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat, PetscSF, PetscSF *); 21f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSign(Vec, Vec); 22f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt); 23f17f7ee2SSatish Balay PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *); 24f17f7ee2SSatish Balay 25f17f7ee2SSatish Balay #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data()) 26f17f7ee2SSatish Balay 27f17f7ee2SSatish Balay /* Use GPU only if H2OPUS is configured for GPU */ 28f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU) 29f17f7ee2SSatish Balay #define PETSC_H2OPUS_USE_GPU 30f17f7ee2SSatish Balay #endif 31f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 32f17f7ee2SSatish Balay #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B))) 33f17f7ee2SSatish Balay #else 343ba16761SJacob Faibussowitsch #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS 35f17f7ee2SSatish Balay #endif 36f17f7ee2SSatish Balay 37f17f7ee2SSatish Balay // TODO H2OPUS: 38f17f7ee2SSatish Balay // DistributedHMatrix 39f17f7ee2SSatish Balay // unsymmetric ? 40f17f7ee2SSatish Balay // transpose for distributed_hgemv? 41f17f7ee2SSatish Balay // clearData() 42f17f7ee2SSatish Balay // Unify interface for sequential and parallel? 43f17f7ee2SSatish Balay // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled) 44f17f7ee2SSatish Balay // 459371c9d4SSatish Balay template <class T> 469371c9d4SSatish Balay class PetscPointCloud : public H2OpusDataSet<T> { 47f17f7ee2SSatish Balay private: 48f17f7ee2SSatish Balay int dimension; 49f17f7ee2SSatish Balay size_t num_points; 50f17f7ee2SSatish Balay std::vector<T> pts; 51f17f7ee2SSatish Balay 52f17f7ee2SSatish Balay public: 53d71ae5a4SJacob Faibussowitsch PetscPointCloud(int dim, size_t num_pts, const T coords[]) 54d71ae5a4SJacob Faibussowitsch { 55f17f7ee2SSatish Balay dim = dim > 0 ? dim : 1; 56f17f7ee2SSatish Balay this->dimension = dim; 57f17f7ee2SSatish Balay this->num_points = num_pts; 58f17f7ee2SSatish Balay 59f17f7ee2SSatish Balay pts.resize(num_pts * dim); 60f17f7ee2SSatish Balay if (coords) { 61036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++) 629371c9d4SSatish Balay for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i]; 63f17f7ee2SSatish Balay } else { 64036e4bdcSSatish Balay PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0; 65036e4bdcSSatish Balay for (size_t n = 0; n < num_pts; n++) { 66036e4bdcSSatish Balay pts[n * dim] = n * h; 679371c9d4SSatish Balay for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0; 68036e4bdcSSatish Balay } 69f17f7ee2SSatish Balay } 70f17f7ee2SSatish Balay } 71f17f7ee2SSatish Balay 72d71ae5a4SJacob Faibussowitsch PetscPointCloud(const PetscPointCloud<T> &other) 73d71ae5a4SJacob Faibussowitsch { 74f17f7ee2SSatish Balay size_t N = other.dimension * other.num_points; 75f17f7ee2SSatish Balay this->dimension = other.dimension; 76f17f7ee2SSatish Balay this->num_points = other.num_points; 77f17f7ee2SSatish Balay this->pts.resize(N); 789371c9d4SSatish Balay for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i]; 79f17f7ee2SSatish Balay } 80f17f7ee2SSatish Balay 819371c9d4SSatish Balay int getDimension() const { return dimension; } 82f17f7ee2SSatish Balay 839371c9d4SSatish Balay size_t getDataSetSize() const { return num_points; } 84f17f7ee2SSatish Balay 85d71ae5a4SJacob Faibussowitsch T getDataPoint(size_t idx, int dim) const 86d71ae5a4SJacob Faibussowitsch { 87f17f7ee2SSatish Balay assert(dim < dimension && idx < num_points); 88f17f7ee2SSatish Balay return pts[idx * dimension + dim]; 89f17f7ee2SSatish Balay } 90f17f7ee2SSatish Balay 91d71ae5a4SJacob Faibussowitsch void Print(std::ostream &out = std::cout) 92d71ae5a4SJacob Faibussowitsch { 93f17f7ee2SSatish Balay out << "Dimension: " << dimension << std::endl; 94f17f7ee2SSatish Balay out << "NumPoints: " << num_points << std::endl; 95f17f7ee2SSatish Balay for (size_t n = 0; n < num_points; n++) { 969371c9d4SSatish Balay for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " "; 97f17f7ee2SSatish Balay out << std::endl; 98f17f7ee2SSatish Balay } 99f17f7ee2SSatish Balay } 100f17f7ee2SSatish Balay }; 101f17f7ee2SSatish Balay 1029371c9d4SSatish Balay template <class T> 1039371c9d4SSatish Balay class PetscFunctionGenerator { 104f17f7ee2SSatish Balay private: 105f17f7ee2SSatish Balay MatH2OpusKernel k; 106f17f7ee2SSatish Balay int dim; 107f17f7ee2SSatish Balay void *ctx; 108f17f7ee2SSatish Balay 109f17f7ee2SSatish Balay public: 110d71ae5a4SJacob Faibussowitsch PetscFunctionGenerator(MatH2OpusKernel k, int dim, void *ctx) 111d71ae5a4SJacob Faibussowitsch { 1129371c9d4SSatish Balay this->k = k; 1139371c9d4SSatish Balay this->dim = dim; 1149371c9d4SSatish Balay this->ctx = ctx; 115f17f7ee2SSatish Balay } 116d71ae5a4SJacob Faibussowitsch PetscFunctionGenerator(PetscFunctionGenerator &other) 117d71ae5a4SJacob Faibussowitsch { 1189371c9d4SSatish Balay this->k = other.k; 1199371c9d4SSatish Balay this->dim = other.dim; 1209371c9d4SSatish Balay this->ctx = other.ctx; 1219371c9d4SSatish Balay } 1229371c9d4SSatish Balay T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); } 123f17f7ee2SSatish Balay }; 124f17f7ee2SSatish Balay 125f17f7ee2SSatish Balay #include <../src/mat/impls/h2opus/math2opussampler.hpp> 126f17f7ee2SSatish Balay 127f17f7ee2SSatish Balay /* just to not clutter the code */ 128f17f7ee2SSatish Balay #if !defined(H2OPUS_USE_GPU) 129f17f7ee2SSatish Balay typedef HMatrix HMatrix_GPU; 130f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 131f17f7ee2SSatish Balay typedef DistributedHMatrix DistributedHMatrix_GPU; 132f17f7ee2SSatish Balay #endif 133f17f7ee2SSatish Balay #endif 134f17f7ee2SSatish Balay 135f17f7ee2SSatish Balay typedef struct { 136f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 137f17f7ee2SSatish Balay distributedH2OpusHandle_t handle; 138f17f7ee2SSatish Balay #else 139f17f7ee2SSatish Balay h2opusHandle_t handle; 140f17f7ee2SSatish Balay #endif 141f17f7ee2SSatish Balay /* Sequential and parallel matrices are two different classes at the moment */ 142f17f7ee2SSatish Balay HMatrix *hmatrix; 143f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 144f17f7ee2SSatish Balay DistributedHMatrix *dist_hmatrix; 145f17f7ee2SSatish Balay #else 146f17f7ee2SSatish Balay HMatrix *dist_hmatrix; /* just to not clutter the code */ 147f17f7ee2SSatish Balay #endif 148f17f7ee2SSatish Balay /* May use permutations */ 149f17f7ee2SSatish Balay PetscSF sf; 150f17f7ee2SSatish Balay PetscLayout h2opus_rmap, h2opus_cmap; 151f17f7ee2SSatish Balay IS h2opus_indexmap; 152f17f7ee2SSatish Balay thrust::host_vector<PetscScalar> *xx, *yy; 153f17f7ee2SSatish Balay PetscInt xxs, yys; 154f17f7ee2SSatish Balay PetscBool multsetup; 155f17f7ee2SSatish Balay 156f17f7ee2SSatish Balay /* GPU */ 157f17f7ee2SSatish Balay HMatrix_GPU *hmatrix_gpu; 158f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 159f17f7ee2SSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu; 160f17f7ee2SSatish Balay #else 161f17f7ee2SSatish Balay HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */ 162f17f7ee2SSatish Balay #endif 163f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 164f17f7ee2SSatish Balay thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu; 165f17f7ee2SSatish Balay PetscInt xxs_gpu, yys_gpu; 166f17f7ee2SSatish Balay #endif 167f17f7ee2SSatish Balay 168f17f7ee2SSatish Balay /* construction from matvecs */ 169f17f7ee2SSatish Balay PetscMatrixSampler *sampler; 170f17f7ee2SSatish Balay PetscBool nativemult; 171f17f7ee2SSatish Balay 172f17f7ee2SSatish Balay /* Admissibility */ 173f17f7ee2SSatish Balay PetscReal eta; 174f17f7ee2SSatish Balay PetscInt leafsize; 175f17f7ee2SSatish Balay 176f17f7ee2SSatish Balay /* for dof reordering */ 177f17f7ee2SSatish Balay PetscPointCloud<PetscReal> *ptcloud; 178f17f7ee2SSatish Balay 179f17f7ee2SSatish Balay /* kernel for generating matrix entries */ 180f17f7ee2SSatish Balay PetscFunctionGenerator<PetscScalar> *kernel; 181f17f7ee2SSatish Balay 182f17f7ee2SSatish Balay /* basis orthogonalized? */ 183f17f7ee2SSatish Balay PetscBool orthogonal; 184f17f7ee2SSatish Balay 185f17f7ee2SSatish Balay /* customization */ 186f17f7ee2SSatish Balay PetscInt basisord; 187f17f7ee2SSatish Balay PetscInt max_rank; 188f17f7ee2SSatish Balay PetscInt bs; 189f17f7ee2SSatish Balay PetscReal rtol; 190f17f7ee2SSatish Balay PetscInt norm_max_samples; 191f17f7ee2SSatish Balay PetscBool check_construction; 192f17f7ee2SSatish Balay PetscBool hara_verbose; 193036e4bdcSSatish Balay PetscBool resize; 194f17f7ee2SSatish Balay 195f17f7ee2SSatish Balay /* keeps track of MatScale values */ 196f17f7ee2SSatish Balay PetscScalar s; 197f17f7ee2SSatish Balay } Mat_H2OPUS; 198f17f7ee2SSatish Balay 199d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_H2OPUS(Mat A) 200d71ae5a4SJacob Faibussowitsch { 201f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 202f17f7ee2SSatish Balay 203f17f7ee2SSatish Balay PetscFunctionBegin; 204f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 205f17f7ee2SSatish Balay h2opusDestroyDistributedHandle(a->handle); 206f17f7ee2SSatish Balay #else 207f17f7ee2SSatish Balay h2opusDestroyHandle(a->handle); 208f17f7ee2SSatish Balay #endif 209f17f7ee2SSatish Balay delete a->dist_hmatrix; 210f17f7ee2SSatish Balay delete a->hmatrix; 211f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&a->sf)); 212f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_rmap)); 213f17f7ee2SSatish Balay PetscCall(PetscLayoutDestroy(&a->h2opus_cmap)); 214f17f7ee2SSatish Balay PetscCall(ISDestroy(&a->h2opus_indexmap)); 215f17f7ee2SSatish Balay delete a->xx; 216f17f7ee2SSatish Balay delete a->yy; 217f17f7ee2SSatish Balay delete a->hmatrix_gpu; 218f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 219f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 220f17f7ee2SSatish Balay delete a->xx_gpu; 221f17f7ee2SSatish Balay delete a->yy_gpu; 222f17f7ee2SSatish Balay #endif 223f17f7ee2SSatish Balay delete a->sampler; 224f17f7ee2SSatish Balay delete a->ptcloud; 225f17f7ee2SSatish Balay delete a->kernel; 226f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL)); 227f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL)); 228f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL)); 229f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL)); 230f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 231f17f7ee2SSatish Balay PetscCall(PetscFree(A->data)); 2323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 233f17f7ee2SSatish Balay } 234f17f7ee2SSatish Balay 235d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm) 236d71ae5a4SJacob Faibussowitsch { 237f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 238f17f7ee2SSatish Balay PetscBool ish2opus; 239f17f7ee2SSatish Balay 240f17f7ee2SSatish Balay PetscFunctionBegin; 241f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 242f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A, nm, 2); 243f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 244f17f7ee2SSatish Balay if (ish2opus) { 245f17f7ee2SSatish Balay if (a->h2opus_rmap) { /* need to swap layouts for vector creation */ 246f17f7ee2SSatish Balay if ((!a->nativemult && nm) || (a->nativemult && !nm)) { 247f17f7ee2SSatish Balay PetscLayout t; 248f17f7ee2SSatish Balay t = A->rmap; 249f17f7ee2SSatish Balay A->rmap = a->h2opus_rmap; 250f17f7ee2SSatish Balay a->h2opus_rmap = t; 251f17f7ee2SSatish Balay t = A->cmap; 252f17f7ee2SSatish Balay A->cmap = a->h2opus_cmap; 253f17f7ee2SSatish Balay a->h2opus_cmap = t; 254f17f7ee2SSatish Balay } 255f17f7ee2SSatish Balay } 256f17f7ee2SSatish Balay a->nativemult = nm; 257f17f7ee2SSatish Balay } 2583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 259f17f7ee2SSatish Balay } 260f17f7ee2SSatish Balay 261d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm) 262d71ae5a4SJacob Faibussowitsch { 263f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 264f17f7ee2SSatish Balay PetscBool ish2opus; 265f17f7ee2SSatish Balay 266f17f7ee2SSatish Balay PetscFunctionBegin; 267f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 268f17f7ee2SSatish Balay PetscValidPointer(nm, 2); 269f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 270f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 271f17f7ee2SSatish Balay *nm = a->nativemult; 2723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 273f17f7ee2SSatish Balay } 274f17f7ee2SSatish Balay 275d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n) 276d71ae5a4SJacob Faibussowitsch { 277f17f7ee2SSatish Balay PetscBool ish2opus; 278f17f7ee2SSatish Balay PetscInt nmax = PETSC_DECIDE; 279f17f7ee2SSatish Balay Mat_H2OPUS *a = NULL; 280f17f7ee2SSatish Balay PetscBool mult = PETSC_FALSE; 281f17f7ee2SSatish Balay 282f17f7ee2SSatish Balay PetscFunctionBegin; 283f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 284f17f7ee2SSatish Balay if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */ 285f17f7ee2SSatish Balay a = (Mat_H2OPUS *)A->data; 286f17f7ee2SSatish Balay 287f17f7ee2SSatish Balay nmax = a->norm_max_samples; 288f17f7ee2SSatish Balay mult = a->nativemult; 289f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE)); 290f17f7ee2SSatish Balay } else { 291f17f7ee2SSatish Balay PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL)); 292f17f7ee2SSatish Balay } 293f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n)); 294f17f7ee2SSatish Balay if (a) PetscCall(MatH2OpusSetNativeMult(A, mult)); 2953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 296f17f7ee2SSatish Balay } 297f17f7ee2SSatish Balay 298d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN) 299d71ae5a4SJacob Faibussowitsch { 300f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 301f17f7ee2SSatish Balay PetscInt n; 302f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 303f17f7ee2SSatish Balay 304f17f7ee2SSatish Balay PetscFunctionBegin; 305f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 306f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 307f17f7ee2SSatish Balay #endif 308f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 309f17f7ee2SSatish Balay if (boundtocpu) { 3109371c9d4SSatish Balay if (h2opus->xxs < xN) { 3119371c9d4SSatish Balay h2opus->xx->resize(n * xN); 3129371c9d4SSatish Balay h2opus->xxs = xN; 3139371c9d4SSatish Balay } 3149371c9d4SSatish Balay if (h2opus->yys < yN) { 3159371c9d4SSatish Balay h2opus->yy->resize(n * yN); 3169371c9d4SSatish Balay h2opus->yys = yN; 3179371c9d4SSatish Balay } 318f17f7ee2SSatish Balay } 319f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 320f17f7ee2SSatish Balay if (!boundtocpu) { 3219371c9d4SSatish Balay if (h2opus->xxs_gpu < xN) { 3229371c9d4SSatish Balay h2opus->xx_gpu->resize(n * xN); 3239371c9d4SSatish Balay h2opus->xxs_gpu = xN; 3249371c9d4SSatish Balay } 3259371c9d4SSatish Balay if (h2opus->yys_gpu < yN) { 3269371c9d4SSatish Balay h2opus->yy_gpu->resize(n * yN); 3279371c9d4SSatish Balay h2opus->yys_gpu = yN; 3289371c9d4SSatish Balay } 329f17f7ee2SSatish Balay } 330f17f7ee2SSatish Balay #endif 3313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 332f17f7ee2SSatish Balay } 333f17f7ee2SSatish Balay 334d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C) 335d71ae5a4SJacob Faibussowitsch { 336f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 337f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 338f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 339f17f7ee2SSatish Balay #else 340f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 341f17f7ee2SSatish Balay #endif 342f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 343f17f7ee2SSatish Balay PetscScalar *xx, *yy, *uxx, *uyy; 344f17f7ee2SSatish Balay PetscInt blda, clda; 345f17f7ee2SSatish Balay PetscMPIInt size; 346f17f7ee2SSatish Balay PetscSF bsf, csf; 347f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 348f17f7ee2SSatish Balay 349f17f7ee2SSatish Balay PetscFunctionBegin; 350f17f7ee2SSatish Balay HLibProfile::clear(); 351f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 352f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 353f17f7ee2SSatish Balay #endif 354f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(B, &blda)); 355f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(C, &clda)); 356f17f7ee2SSatish Balay if (usesf) { 357f17f7ee2SSatish Balay PetscInt n; 358f17f7ee2SSatish Balay 359f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(B, h2opus->sf, &bsf)); 360f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(C, h2opus->sf, &csf)); 361f17f7ee2SSatish Balay 362f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N)); 363f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 364f17f7ee2SSatish Balay blda = n; 365f17f7ee2SSatish Balay clda = n; 366f17f7ee2SSatish Balay } 367f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 368f17f7ee2SSatish Balay if (boundtocpu) { 369f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx)); 370f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayWrite(C, &yy)); 371f17f7ee2SSatish Balay if (usesf) { 372f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 373f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 374f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 375f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 376f17f7ee2SSatish Balay } else { 377f17f7ee2SSatish Balay uxx = xx; 378f17f7ee2SSatish Balay uyy = yy; 379f17f7ee2SSatish Balay } 380f17f7ee2SSatish Balay if (size > 1) { 381f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 382036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 383f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 384f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 385f17f7ee2SSatish Balay #endif 386f17f7ee2SSatish Balay } else { 387f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 388f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 389f17f7ee2SSatish Balay } 390f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx)); 391f17f7ee2SSatish Balay if (usesf) { 392f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 393f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 394f17f7ee2SSatish Balay } 395f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayWrite(C, &yy)); 396f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 397f17f7ee2SSatish Balay } else { 398f17f7ee2SSatish Balay PetscBool ciscuda, biscuda; 399f17f7ee2SSatish Balay 400f17f7ee2SSatish Balay /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */ 401f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 40248a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B)); 403f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 404f17f7ee2SSatish Balay if (!ciscuda) { 405f17f7ee2SSatish Balay C->assembled = PETSC_TRUE; 406f17f7ee2SSatish Balay PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C)); 407f17f7ee2SSatish Balay } 408f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx)); 409f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayWrite(C, &yy)); 410f17f7ee2SSatish Balay if (usesf) { 411f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 412f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 413f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 414f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 415f17f7ee2SSatish Balay } else { 416f17f7ee2SSatish Balay uxx = xx; 417f17f7ee2SSatish Balay uyy = yy; 418f17f7ee2SSatish Balay } 419f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 420f17f7ee2SSatish Balay if (size > 1) { 421f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 422036e4bdcSSatish Balay PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 423f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 424f17f7ee2SSatish Balay distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle); 425f17f7ee2SSatish Balay #endif 426f17f7ee2SSatish Balay } else { 427f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 428f17f7ee2SSatish Balay hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle); 429f17f7ee2SSatish Balay } 430f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 431f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx)); 432f17f7ee2SSatish Balay if (usesf) { 433f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 434f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 435f17f7ee2SSatish Balay } 436f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy)); 43748a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B)); 43848a46eb9SPierre Jolivet if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C)); 439f17f7ee2SSatish Balay #endif 440f17f7ee2SSatish Balay } 441f17f7ee2SSatish Balay { /* log flops */ 442f17f7ee2SSatish Balay double gops, time, perf, dev; 443f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops, time, perf, dev); 444f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 445f17f7ee2SSatish Balay if (boundtocpu) { 446f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 447f17f7ee2SSatish Balay } else { 448f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops)); 449f17f7ee2SSatish Balay } 450f17f7ee2SSatish Balay #else 451f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 452f17f7ee2SSatish Balay #endif 453f17f7ee2SSatish Balay } 4543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 455f17f7ee2SSatish Balay } 456f17f7ee2SSatish Balay 457d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_H2OPUS(Mat C) 458d71ae5a4SJacob Faibussowitsch { 459f17f7ee2SSatish Balay Mat_Product *product = C->product; 460f17f7ee2SSatish Balay 461f17f7ee2SSatish Balay PetscFunctionBegin; 462f17f7ee2SSatish Balay MatCheckProduct(C, 1); 463f17f7ee2SSatish Balay switch (product->type) { 464d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AB: 465d71ae5a4SJacob Faibussowitsch PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C)); 466d71ae5a4SJacob Faibussowitsch break; 467d71ae5a4SJacob Faibussowitsch case MATPRODUCT_AtB: 468d71ae5a4SJacob Faibussowitsch PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C)); 469d71ae5a4SJacob Faibussowitsch break; 470d71ae5a4SJacob Faibussowitsch default: 471d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 472f17f7ee2SSatish Balay } 4733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 474f17f7ee2SSatish Balay } 475f17f7ee2SSatish Balay 476d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C) 477d71ae5a4SJacob Faibussowitsch { 478f17f7ee2SSatish Balay Mat_Product *product = C->product; 479f17f7ee2SSatish Balay PetscBool cisdense; 480f17f7ee2SSatish Balay Mat A, B; 481f17f7ee2SSatish Balay 482f17f7ee2SSatish Balay PetscFunctionBegin; 483f17f7ee2SSatish Balay MatCheckProduct(C, 1); 484f17f7ee2SSatish Balay A = product->A; 485f17f7ee2SSatish Balay B = product->B; 486f17f7ee2SSatish Balay switch (product->type) { 487f17f7ee2SSatish Balay case MATPRODUCT_AB: 488f17f7ee2SSatish Balay PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N)); 489f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 490f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 491f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 492f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 493f17f7ee2SSatish Balay break; 494f17f7ee2SSatish Balay case MATPRODUCT_AtB: 495f17f7ee2SSatish Balay PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N)); 496f17f7ee2SSatish Balay PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B)); 497f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, "")); 498f17f7ee2SSatish Balay if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name)); 499f17f7ee2SSatish Balay PetscCall(MatSetUp(C)); 500f17f7ee2SSatish Balay break; 501d71ae5a4SJacob Faibussowitsch default: 502d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]); 503f17f7ee2SSatish Balay } 504f17f7ee2SSatish Balay C->ops->productsymbolic = NULL; 505f17f7ee2SSatish Balay C->ops->productnumeric = MatProductNumeric_H2OPUS; 5063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 507f17f7ee2SSatish Balay } 508f17f7ee2SSatish Balay 509d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C) 510d71ae5a4SJacob Faibussowitsch { 511f17f7ee2SSatish Balay PetscFunctionBegin; 512f17f7ee2SSatish Balay MatCheckProduct(C, 1); 513ad540459SPierre Jolivet if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS; 5143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 515f17f7ee2SSatish Balay } 516f17f7ee2SSatish Balay 517d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans) 518d71ae5a4SJacob Faibussowitsch { 519f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 520f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 521f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle->handle; 522f17f7ee2SSatish Balay #else 523f17f7ee2SSatish Balay h2opusHandle_t handle = h2opus->handle; 524f17f7ee2SSatish Balay #endif 525f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 526f17f7ee2SSatish Balay PetscInt n; 527f17f7ee2SSatish Balay PetscScalar *xx, *yy, *uxx, *uyy; 528f17f7ee2SSatish Balay PetscMPIInt size; 529f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult); 530f17f7ee2SSatish Balay 531f17f7ee2SSatish Balay PetscFunctionBegin; 532f17f7ee2SSatish Balay HLibProfile::clear(); 533f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 534f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 535f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 536f17f7ee2SSatish Balay #endif 537f17f7ee2SSatish Balay if (usesf) { 538f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL)); 539f17f7ee2SSatish Balay } else n = A->rmap->n; 540f17f7ee2SSatish Balay if (boundtocpu) { 541f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx)); 542f17f7ee2SSatish Balay if (sy == 0.0) { 543f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(y, &yy)); 544f17f7ee2SSatish Balay } else { 545f17f7ee2SSatish Balay PetscCall(VecGetArray(y, &yy)); 546f17f7ee2SSatish Balay } 547f17f7ee2SSatish Balay if (usesf) { 548f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx); 549f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy); 550f17f7ee2SSatish Balay 551f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 552f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 553f17f7ee2SSatish Balay if (sy != 0.0) { 554f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 555f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 556f17f7ee2SSatish Balay } 557f17f7ee2SSatish Balay } else { 558f17f7ee2SSatish Balay uxx = xx; 559f17f7ee2SSatish Balay uyy = yy; 560f17f7ee2SSatish Balay } 561f17f7ee2SSatish Balay if (size > 1) { 562f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 563036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 564f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 565f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle); 566f17f7ee2SSatish Balay #endif 567f17f7ee2SSatish Balay } else { 568f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 569f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle); 570f17f7ee2SSatish Balay } 571f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx)); 572f17f7ee2SSatish Balay if (usesf) { 573f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 574f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 575f17f7ee2SSatish Balay } 576f17f7ee2SSatish Balay if (sy == 0.0) { 577f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(y, &yy)); 578f17f7ee2SSatish Balay } else { 579f17f7ee2SSatish Balay PetscCall(VecRestoreArray(y, &yy)); 580f17f7ee2SSatish Balay } 581f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 582f17f7ee2SSatish Balay } else { 583f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx)); 584f17f7ee2SSatish Balay if (sy == 0.0) { 585f17f7ee2SSatish Balay PetscCall(VecCUDAGetArrayWrite(y, &yy)); 586f17f7ee2SSatish Balay } else { 587f17f7ee2SSatish Balay PetscCall(VecCUDAGetArray(y, &yy)); 588f17f7ee2SSatish Balay } 589f17f7ee2SSatish Balay if (usesf) { 590f17f7ee2SSatish Balay uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu); 591f17f7ee2SSatish Balay uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu); 592f17f7ee2SSatish Balay 593f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 594f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE)); 595f17f7ee2SSatish Balay if (sy != 0.0) { 596f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 597f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE)); 598f17f7ee2SSatish Balay } 599f17f7ee2SSatish Balay } else { 600f17f7ee2SSatish Balay uxx = xx; 601f17f7ee2SSatish Balay uyy = yy; 602f17f7ee2SSatish Balay } 603f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 604f17f7ee2SSatish Balay if (size > 1) { 605f17f7ee2SSatish Balay PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix"); 606036e4bdcSSatish Balay PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel"); 607f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 608f17f7ee2SSatish Balay distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle); 609f17f7ee2SSatish Balay #endif 610f17f7ee2SSatish Balay } else { 611f17f7ee2SSatish Balay PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 612f17f7ee2SSatish Balay hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle); 613f17f7ee2SSatish Balay } 614f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 615f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx)); 616f17f7ee2SSatish Balay if (usesf) { 617f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 618f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE)); 619f17f7ee2SSatish Balay } 620f17f7ee2SSatish Balay if (sy == 0.0) { 621f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArrayWrite(y, &yy)); 622f17f7ee2SSatish Balay } else { 623f17f7ee2SSatish Balay PetscCall(VecCUDARestoreArray(y, &yy)); 624f17f7ee2SSatish Balay } 625f17f7ee2SSatish Balay #endif 626f17f7ee2SSatish Balay } 627f17f7ee2SSatish Balay { /* log flops */ 628f17f7ee2SSatish Balay double gops, time, perf, dev; 629f17f7ee2SSatish Balay HLibProfile::getHgemvPerf(gops, time, perf, dev); 630f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 631f17f7ee2SSatish Balay if (boundtocpu) { 632f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 633f17f7ee2SSatish Balay } else { 634f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops)); 635f17f7ee2SSatish Balay } 636f17f7ee2SSatish Balay #else 637f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 638f17f7ee2SSatish Balay #endif 639f17f7ee2SSatish Balay } 6403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 641f17f7ee2SSatish Balay } 642f17f7ee2SSatish Balay 643d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y) 644d71ae5a4SJacob Faibussowitsch { 645f17f7ee2SSatish Balay PetscBool xiscuda, yiscuda; 646f17f7ee2SSatish Balay 647f17f7ee2SSatish Balay PetscFunctionBegin; 648f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 649f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 650f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 651f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE)); 6523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 653f17f7ee2SSatish Balay } 654f17f7ee2SSatish Balay 655d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y) 656d71ae5a4SJacob Faibussowitsch { 657f17f7ee2SSatish Balay PetscBool xiscuda, yiscuda; 658f17f7ee2SSatish Balay 659f17f7ee2SSatish Balay PetscFunctionBegin; 660f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 661f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, "")); 662f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda)); 663f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE)); 6643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 665f17f7ee2SSatish Balay } 666f17f7ee2SSatish Balay 667d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 668d71ae5a4SJacob Faibussowitsch { 669f17f7ee2SSatish Balay PetscBool xiscuda, ziscuda; 670f17f7ee2SSatish Balay 671f17f7ee2SSatish Balay PetscFunctionBegin; 672f17f7ee2SSatish Balay PetscCall(VecCopy(y, z)); 673f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 674f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 675f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 676f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE)); 6773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 678f17f7ee2SSatish Balay } 679f17f7ee2SSatish Balay 680d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z) 681d71ae5a4SJacob Faibussowitsch { 682f17f7ee2SSatish Balay PetscBool xiscuda, ziscuda; 683f17f7ee2SSatish Balay 684f17f7ee2SSatish Balay PetscFunctionBegin; 685f17f7ee2SSatish Balay PetscCall(VecCopy(y, z)); 686f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, "")); 687f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, "")); 688f17f7ee2SSatish Balay PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda)); 689f17f7ee2SSatish Balay PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE)); 6903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 691f17f7ee2SSatish Balay } 692f17f7ee2SSatish Balay 693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s) 694d71ae5a4SJacob Faibussowitsch { 695f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 696f17f7ee2SSatish Balay 697f17f7ee2SSatish Balay PetscFunctionBegin; 698f17f7ee2SSatish Balay a->s *= s; 6993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 700f17f7ee2SSatish Balay } 701f17f7ee2SSatish Balay 702d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject) 703d71ae5a4SJacob Faibussowitsch { 704f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 705f17f7ee2SSatish Balay 706f17f7ee2SSatish Balay PetscFunctionBegin; 707036e4bdcSSatish Balay PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options"); 708f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL)); 709f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL)); 710f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL)); 711f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL)); 712f17f7ee2SSatish Balay PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL)); 713aaa8cc7dSPierre Jolivet PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL)); 714f17f7ee2SSatish Balay PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL)); 715f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL)); 716f17f7ee2SSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL)); 717036e4bdcSSatish Balay PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL)); 718036e4bdcSSatish Balay PetscOptionsHeadEnd(); 7193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 720f17f7ee2SSatish Balay } 721f17f7ee2SSatish Balay 722f17f7ee2SSatish Balay static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernel, void *); 723f17f7ee2SSatish Balay 724d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A) 725d71ae5a4SJacob Faibussowitsch { 726f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 727f17f7ee2SSatish Balay Vec c; 728f17f7ee2SSatish Balay PetscInt spacedim; 729f17f7ee2SSatish Balay const PetscScalar *coords; 730f17f7ee2SSatish Balay 731f17f7ee2SSatish Balay PetscFunctionBegin; 7323ba16761SJacob Faibussowitsch if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS); 733f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c)); 734f17f7ee2SSatish Balay if (!c && a->sampler) { 735f17f7ee2SSatish Balay Mat S = a->sampler->GetSamplingMat(); 736f17f7ee2SSatish Balay 737f17f7ee2SSatish Balay PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c)); 738f17f7ee2SSatish Balay } 739f17f7ee2SSatish Balay if (!c) { 740f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL)); 741f17f7ee2SSatish Balay } else { 742f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(c, &coords)); 743f17f7ee2SSatish Balay PetscCall(VecGetBlockSize(c, &spacedim)); 744f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL)); 745f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(c, &coords)); 746f17f7ee2SSatish Balay } 7473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748f17f7ee2SSatish Balay } 749f17f7ee2SSatish Balay 750d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A) 751d71ae5a4SJacob Faibussowitsch { 752f17f7ee2SSatish Balay MPI_Comm comm; 753f17f7ee2SSatish Balay PetscMPIInt size; 754f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 755f17f7ee2SSatish Balay PetscInt n = 0, *idx = NULL; 756f17f7ee2SSatish Balay int *iidx = NULL; 757f17f7ee2SSatish Balay PetscCopyMode own; 758f17f7ee2SSatish Balay PetscBool rid; 759f17f7ee2SSatish Balay 760f17f7ee2SSatish Balay PetscFunctionBegin; 7613ba16761SJacob Faibussowitsch if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS); 762f17f7ee2SSatish Balay if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */ 763f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 764f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 765f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 766f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 767f17f7ee2SSatish Balay a->xxs_gpu = 1; 768f17f7ee2SSatish Balay a->yys_gpu = 1; 769f17f7ee2SSatish Balay #endif 770f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 771f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 772f17f7ee2SSatish Balay a->xxs = 1; 773f17f7ee2SSatish Balay a->yys = 1; 774f17f7ee2SSatish Balay } else { 775f17f7ee2SSatish Balay IS is; 776f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 777f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size)); 778f17f7ee2SSatish Balay if (!a->h2opus_indexmap) { 779f17f7ee2SSatish Balay if (size > 1) { 780f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 781f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 782f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map); 783f17f7ee2SSatish Balay n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size(); 784f17f7ee2SSatish Balay #endif 785f17f7ee2SSatish Balay } else { 786f17f7ee2SSatish Balay iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map); 787f17f7ee2SSatish Balay n = a->hmatrix->u_basis_tree.index_map.size(); 788f17f7ee2SSatish Balay } 789f17f7ee2SSatish Balay 790f17f7ee2SSatish Balay if (PetscDefined(USE_64BIT_INDICES)) { 791f17f7ee2SSatish Balay PetscInt i; 792f17f7ee2SSatish Balay 793f17f7ee2SSatish Balay own = PETSC_OWN_POINTER; 794f17f7ee2SSatish Balay PetscCall(PetscMalloc1(n, &idx)); 795f17f7ee2SSatish Balay for (i = 0; i < n; i++) idx[i] = iidx[i]; 796f17f7ee2SSatish Balay } else { 797f17f7ee2SSatish Balay own = PETSC_COPY_VALUES; 798f17f7ee2SSatish Balay idx = (PetscInt *)iidx; 799f17f7ee2SSatish Balay } 800f17f7ee2SSatish Balay PetscCall(ISCreateGeneral(comm, n, idx, own, &is)); 801f17f7ee2SSatish Balay PetscCall(ISSetPermutation(is)); 802f17f7ee2SSatish Balay PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view")); 803f17f7ee2SSatish Balay a->h2opus_indexmap = is; 804f17f7ee2SSatish Balay } 805f17f7ee2SSatish Balay PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n)); 806f17f7ee2SSatish Balay PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 807f17f7ee2SSatish Balay rid = (PetscBool)(n == A->rmap->n); 808f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm)); 80948a46eb9SPierre Jolivet if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid)); 810f17f7ee2SSatish Balay if (!rid) { 811f17f7ee2SSatish Balay if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */ 812f17f7ee2SSatish Balay PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap)); 813f17f7ee2SSatish Balay PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n)); 814f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(a->h2opus_rmap)); 815f17f7ee2SSatish Balay PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap)); 816f17f7ee2SSatish Balay } 817f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm, &a->sf)); 818f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx)); 819036e4bdcSSatish Balay PetscCall(PetscSFSetUp(a->sf)); 820f17f7ee2SSatish Balay PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view")); 821f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 822f17f7ee2SSatish Balay a->xx_gpu = new thrust::device_vector<PetscScalar>(n); 823f17f7ee2SSatish Balay a->yy_gpu = new thrust::device_vector<PetscScalar>(n); 824f17f7ee2SSatish Balay a->xxs_gpu = 1; 825f17f7ee2SSatish Balay a->yys_gpu = 1; 826f17f7ee2SSatish Balay #endif 827f17f7ee2SSatish Balay a->xx = new thrust::host_vector<PetscScalar>(n); 828f17f7ee2SSatish Balay a->yy = new thrust::host_vector<PetscScalar>(n); 829f17f7ee2SSatish Balay a->xxs = 1; 830f17f7ee2SSatish Balay a->yys = 1; 831f17f7ee2SSatish Balay } 832f17f7ee2SSatish Balay PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx)); 833f17f7ee2SSatish Balay } 834f17f7ee2SSatish Balay a->multsetup = PETSC_TRUE; 8353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 836f17f7ee2SSatish Balay } 837f17f7ee2SSatish Balay 838d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype) 839d71ae5a4SJacob Faibussowitsch { 840f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 841f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 842f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 843f17f7ee2SSatish Balay #else 844f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 845f17f7ee2SSatish Balay #endif 846f17f7ee2SSatish Balay PetscBool kernel = PETSC_FALSE; 847f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 848f17f7ee2SSatish Balay PetscBool samplingdone = PETSC_FALSE; 849f17f7ee2SSatish Balay MPI_Comm comm; 850f17f7ee2SSatish Balay PetscMPIInt size; 851f17f7ee2SSatish Balay 852f17f7ee2SSatish Balay PetscFunctionBegin; 853f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 854036e4bdcSSatish Balay PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 855036e4bdcSSatish Balay PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 856f17f7ee2SSatish Balay 857f17f7ee2SSatish Balay /* XXX */ 858f17f7ee2SSatish Balay a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N)); 859f17f7ee2SSatish Balay 860f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size)); 861f17f7ee2SSatish Balay /* TODO REUSABILITY of geometric construction */ 862f17f7ee2SSatish Balay delete a->hmatrix; 863f17f7ee2SSatish Balay delete a->dist_hmatrix; 864f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 865f17f7ee2SSatish Balay delete a->hmatrix_gpu; 866f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 867f17f7ee2SSatish Balay #endif 868f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 869f17f7ee2SSatish Balay 870f17f7ee2SSatish Balay /* TODO: other? */ 871f17f7ee2SSatish Balay H2OpusBoxCenterAdmissibility adm(a->eta); 872f17f7ee2SSatish Balay 873f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0)); 874f17f7ee2SSatish Balay if (size > 1) { 875f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 876f17f7ee2SSatish Balay a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */); 877f17f7ee2SSatish Balay #else 878f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 879f17f7ee2SSatish Balay #endif 880b94d7dedSBarry Smith } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE); 881f17f7ee2SSatish Balay PetscCall(MatH2OpusInferCoordinates_Private(A)); 882f17f7ee2SSatish Balay PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud"); 883f17f7ee2SSatish Balay if (a->kernel) { 884f17f7ee2SSatish Balay BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel); 885f17f7ee2SSatish Balay if (size > 1) { 886f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 887f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 888f17f7ee2SSatish Balay buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle); 889f17f7ee2SSatish Balay #endif 890f17f7ee2SSatish Balay } else { 891f17f7ee2SSatish Balay buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord); 892f17f7ee2SSatish Balay } 893f17f7ee2SSatish Balay kernel = PETSC_TRUE; 894f17f7ee2SSatish Balay } else { 895036e4bdcSSatish Balay PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel"); 896f17f7ee2SSatish Balay buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm); 897f17f7ee2SSatish Balay } 898f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 899f17f7ee2SSatish Balay 900f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 901f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 902f17f7ee2SSatish Balay if (!boundtocpu) { 903f17f7ee2SSatish Balay if (size > 1) { 904f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix"); 905f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 906f17f7ee2SSatish Balay a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 907f17f7ee2SSatish Balay #endif 908f17f7ee2SSatish Balay } else { 909f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 910f17f7ee2SSatish Balay } 911f17f7ee2SSatish Balay } 912f17f7ee2SSatish Balay #endif 913f17f7ee2SSatish Balay if (size == 1) { 914f17f7ee2SSatish Balay if (!kernel && a->sampler && a->sampler->GetSamplingMat()) { 915f17f7ee2SSatish Balay PetscReal Anorm; 916f17f7ee2SSatish Balay bool verbose; 917f17f7ee2SSatish Balay 918f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL)); 919f17f7ee2SSatish Balay verbose = a->hara_verbose; 920f17f7ee2SSatish Balay PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm)); 921f17f7ee2SSatish Balay if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs)); 922ad540459SPierre Jolivet if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data()); 923f17f7ee2SSatish Balay a->sampler->SetStream(handle->getMainStream()); 924f17f7ee2SSatish Balay if (boundtocpu) { 925f17f7ee2SSatish Balay a->sampler->SetGPUSampling(false); 926f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 927f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 928f17f7ee2SSatish Balay } else { 929f17f7ee2SSatish Balay a->sampler->SetGPUSampling(true); 930f17f7ee2SSatish Balay hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose); 931f17f7ee2SSatish Balay #endif 932f17f7ee2SSatish Balay } 933f17f7ee2SSatish Balay samplingdone = PETSC_TRUE; 934f17f7ee2SSatish Balay } 935f17f7ee2SSatish Balay } 936f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 937f17f7ee2SSatish Balay if (!boundtocpu) { 938f17f7ee2SSatish Balay delete a->hmatrix; 939f17f7ee2SSatish Balay delete a->dist_hmatrix; 940f17f7ee2SSatish Balay a->hmatrix = NULL; 941f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 942f17f7ee2SSatish Balay } 943f17f7ee2SSatish Balay A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 944f17f7ee2SSatish Balay #endif 945f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0)); 946f17f7ee2SSatish Balay 947f17f7ee2SSatish Balay if (!a->s) a->s = 1.0; 948f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 949f17f7ee2SSatish Balay 950f17f7ee2SSatish Balay if (samplingdone) { 951f17f7ee2SSatish Balay PetscBool check = a->check_construction; 952f17f7ee2SSatish Balay PetscBool checke = PETSC_FALSE; 953f17f7ee2SSatish Balay 954f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL)); 955f17f7ee2SSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL)); 956f17f7ee2SSatish Balay if (check) { 957f17f7ee2SSatish Balay Mat E, Ae; 958f17f7ee2SSatish Balay PetscReal n1, ni, n2; 959f17f7ee2SSatish Balay PetscReal n1A, niA, n2A; 960f17f7ee2SSatish Balay void (*normfunc)(void); 961f17f7ee2SSatish Balay 962f17f7ee2SSatish Balay Ae = a->sampler->GetSamplingMat(); 963f17f7ee2SSatish Balay PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E)); 964f17f7ee2SSatish Balay PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS)); 965f17f7ee2SSatish Balay PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN)); 966f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_1, &n1)); 967f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_INFINITY, &ni)); 968f17f7ee2SSatish Balay PetscCall(MatNorm(E, NORM_2, &n2)); 969f17f7ee2SSatish Balay if (checke) { 970f17f7ee2SSatish Balay Mat eA, eE, eAe; 971f17f7ee2SSatish Balay 972f17f7ee2SSatish Balay PetscCall(MatComputeOperator(A, MATAIJ, &eA)); 973f17f7ee2SSatish Balay PetscCall(MatComputeOperator(E, MATAIJ, &eE)); 974f17f7ee2SSatish Balay PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe)); 975f17f7ee2SSatish Balay PetscCall(MatChop(eA, PETSC_SMALL)); 976f17f7ee2SSatish Balay PetscCall(MatChop(eE, PETSC_SMALL)); 977f17f7ee2SSatish Balay PetscCall(MatChop(eAe, PETSC_SMALL)); 978f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat")); 979f17f7ee2SSatish Balay PetscCall(MatView(eA, NULL)); 980f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eAe, "S")); 981f17f7ee2SSatish Balay PetscCall(MatView(eAe, NULL)); 982f17f7ee2SSatish Balay PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S")); 983f17f7ee2SSatish Balay PetscCall(MatView(eE, NULL)); 984f17f7ee2SSatish Balay PetscCall(MatDestroy(&eA)); 985f17f7ee2SSatish Balay PetscCall(MatDestroy(&eE)); 986f17f7ee2SSatish Balay PetscCall(MatDestroy(&eAe)); 987f17f7ee2SSatish Balay } 988f17f7ee2SSatish Balay 989f17f7ee2SSatish Balay PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc)); 990f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS)); 991f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_1, &n1A)); 992f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_INFINITY, &niA)); 993f17f7ee2SSatish Balay PetscCall(MatNorm(Ae, NORM_2, &n2A)); 994f17f7ee2SSatish Balay n1A = PetscMax(n1A, PETSC_SMALL); 995f17f7ee2SSatish Balay n2A = PetscMax(n2A, PETSC_SMALL); 996f17f7ee2SSatish Balay niA = PetscMax(niA, PETSC_SMALL); 997f17f7ee2SSatish Balay PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc)); 998f17f7ee2SSatish Balay PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A))); 999f17f7ee2SSatish Balay PetscCall(MatDestroy(&E)); 1000f17f7ee2SSatish Balay } 1001f17f7ee2SSatish Balay a->sampler->SetSamplingMat(NULL); 1002f17f7ee2SSatish Balay } 10033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1004f17f7ee2SSatish Balay } 1005f17f7ee2SSatish Balay 1006d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_H2OPUS(Mat A) 1007d71ae5a4SJacob Faibussowitsch { 1008f17f7ee2SSatish Balay PetscMPIInt size; 1009f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1010f17f7ee2SSatish Balay 1011f17f7ee2SSatish Balay PetscFunctionBegin; 1012f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1013036e4bdcSSatish Balay PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported"); 1014f17f7ee2SSatish Balay a->hmatrix->clearData(); 1015f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1016f17f7ee2SSatish Balay if (a->hmatrix_gpu) a->hmatrix_gpu->clearData(); 1017f17f7ee2SSatish Balay #endif 10183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1019f17f7ee2SSatish Balay } 1020f17f7ee2SSatish Balay 1021d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA) 1022d71ae5a4SJacob Faibussowitsch { 1023f17f7ee2SSatish Balay Mat A; 1024f17f7ee2SSatish Balay Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data; 1025f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1026f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1027f17f7ee2SSatish Balay #else 1028f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1029f17f7ee2SSatish Balay #endif 1030f17f7ee2SSatish Balay MPI_Comm comm; 1031f17f7ee2SSatish Balay 1032f17f7ee2SSatish Balay PetscFunctionBegin; 1033f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1034f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A)); 1035f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1036f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS)); 1037f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B, A)); 1038f17f7ee2SSatish Balay a = (Mat_H2OPUS *)A->data; 1039f17f7ee2SSatish Balay 1040f17f7ee2SSatish Balay a->eta = b->eta; 1041f17f7ee2SSatish Balay a->leafsize = b->leafsize; 1042f17f7ee2SSatish Balay a->basisord = b->basisord; 1043f17f7ee2SSatish Balay a->max_rank = b->max_rank; 1044f17f7ee2SSatish Balay a->bs = b->bs; 1045f17f7ee2SSatish Balay a->rtol = b->rtol; 1046f17f7ee2SSatish Balay a->norm_max_samples = b->norm_max_samples; 1047f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) a->s = b->s; 1048f17f7ee2SSatish Balay 1049f17f7ee2SSatish Balay a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud); 1050f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel); 1051f17f7ee2SSatish Balay 1052f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1053ad540459SPierre Jolivet if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); 1054f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1055ad540459SPierre Jolivet if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); 1056f17f7ee2SSatish Balay #endif 1057f17f7ee2SSatish Balay #endif 1058f17f7ee2SSatish Balay if (b->hmatrix) { 1059f17f7ee2SSatish Balay a->hmatrix = new HMatrix(*b->hmatrix); 1060f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); 1061f17f7ee2SSatish Balay } 1062f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1063f17f7ee2SSatish Balay if (b->hmatrix_gpu) { 1064f17f7ee2SSatish Balay a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu); 1065f17f7ee2SSatish Balay if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData(); 1066f17f7ee2SSatish Balay } 1067f17f7ee2SSatish Balay #endif 1068f17f7ee2SSatish Balay if (b->sf) { 1069f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->sf)); 1070f17f7ee2SSatish Balay a->sf = b->sf; 1071f17f7ee2SSatish Balay } 1072f17f7ee2SSatish Balay if (b->h2opus_indexmap) { 1073f17f7ee2SSatish Balay PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap)); 1074f17f7ee2SSatish Balay a->h2opus_indexmap = b->h2opus_indexmap; 1075f17f7ee2SSatish Balay } 1076f17f7ee2SSatish Balay 1077f17f7ee2SSatish Balay PetscCall(MatSetUp(A)); 1078f17f7ee2SSatish Balay PetscCall(MatSetUpMultiply_H2OPUS(A)); 1079f17f7ee2SSatish Balay if (op == MAT_COPY_VALUES) { 1080f17f7ee2SSatish Balay A->assembled = PETSC_TRUE; 1081f17f7ee2SSatish Balay a->orthogonal = b->orthogonal; 1082f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1083f17f7ee2SSatish Balay A->offloadmask = B->offloadmask; 1084f17f7ee2SSatish Balay #endif 1085f17f7ee2SSatish Balay } 1086f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1087f17f7ee2SSatish Balay iscpu = B->boundtocpu; 1088f17f7ee2SSatish Balay #endif 1089f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, iscpu)); 1090f17f7ee2SSatish Balay 1091f17f7ee2SSatish Balay *nA = A; 10923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1093f17f7ee2SSatish Balay } 1094f17f7ee2SSatish Balay 1095d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view) 1096d71ae5a4SJacob Faibussowitsch { 1097f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1098036e4bdcSSatish Balay PetscBool isascii, vieweps; 1099f17f7ee2SSatish Balay PetscMPIInt size; 1100f17f7ee2SSatish Balay PetscViewerFormat format; 1101f17f7ee2SSatish Balay 1102f17f7ee2SSatish Balay PetscFunctionBegin; 1103f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii)); 1104f17f7ee2SSatish Balay PetscCall(PetscViewerGetFormat(view, &format)); 1105f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1106f17f7ee2SSatish Balay if (isascii) { 1107f17f7ee2SSatish Balay if (format == PETSC_VIEWER_ASCII_MATLAB) { 1108f17f7ee2SSatish Balay if (size == 1) { 1109f17f7ee2SSatish Balay FILE *fp; 1110f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIGetPointer(view, &fp)); 1111f17f7ee2SSatish Balay dumpHMatrix(*h2opus->hmatrix, 6, fp); 1112f17f7ee2SSatish Balay } 1113f17f7ee2SSatish Balay } else { 1114f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat")); 1115f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0)); 1116f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta)); 1117f17f7ee2SSatish Balay if (!h2opus->kernel) { 1118f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol)); 1119f17f7ee2SSatish Balay } else { 1120f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Offdiagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord)); 1121f17f7ee2SSatish Balay } 1122f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples)); 1123f17f7ee2SSatish Balay if (size == 1) { 1124f17f7ee2SSatish Balay double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0; 1125f17f7ee2SSatish Balay double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0; 1126f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1127f17f7ee2SSatish Balay double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0; 1128f17f7ee2SSatish Balay double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0; 1129f17f7ee2SSatish Balay #endif 1130f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu)); 1131f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1132f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu)); 1133f17f7ee2SSatish Balay #endif 1134f17f7ee2SSatish Balay } else { 1135f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1136f17f7ee2SSatish Balay double matrix_mem[4] = {0., 0., 0., 0.}; 1137f17f7ee2SSatish Balay PetscMPIInt rsize = 4; 1138f17f7ee2SSatish Balay #else 1139f17f7ee2SSatish Balay double matrix_mem[2] = {0., 0.}; 1140f17f7ee2SSatish Balay PetscMPIInt rsize = 2; 1141f17f7ee2SSatish Balay #endif 1142f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1143f17f7ee2SSatish Balay matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0; 1144f17f7ee2SSatish Balay matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0; 1145f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1146f17f7ee2SSatish Balay matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0; 1147f17f7ee2SSatish Balay matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0; 1148f17f7ee2SSatish Balay #endif 1149f17f7ee2SSatish Balay #endif 1150f17f7ee2SSatish Balay PetscCall(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A))); 1151f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1])); 1152f17f7ee2SSatish Balay #if defined(PETSC_HAVE_CUDA) 1153f17f7ee2SSatish Balay PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3])); 1154f17f7ee2SSatish Balay #endif 1155f17f7ee2SSatish Balay } 1156f17f7ee2SSatish Balay } 1157f17f7ee2SSatish Balay } 1158036e4bdcSSatish Balay vieweps = PETSC_FALSE; 1159036e4bdcSSatish Balay PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL)); 1160036e4bdcSSatish Balay if (vieweps) { 1161f17f7ee2SSatish Balay char filename[256]; 1162f17f7ee2SSatish Balay const char *name; 1163f17f7ee2SSatish Balay 1164f17f7ee2SSatish Balay PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1165f17f7ee2SSatish Balay PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name)); 1166036e4bdcSSatish Balay PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL)); 1167f17f7ee2SSatish Balay outputEps(*h2opus->hmatrix, filename); 1168f17f7ee2SSatish Balay } 11693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1170f17f7ee2SSatish Balay } 1171f17f7ee2SSatish Balay 1172d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx) 1173d71ae5a4SJacob Faibussowitsch { 1174f17f7ee2SSatish Balay Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data; 1175f17f7ee2SSatish Balay PetscReal *gcoords; 1176f17f7ee2SSatish Balay PetscInt N; 1177f17f7ee2SSatish Balay MPI_Comm comm; 1178f17f7ee2SSatish Balay PetscMPIInt size; 1179f17f7ee2SSatish Balay PetscBool cong; 1180f17f7ee2SSatish Balay 1181f17f7ee2SSatish Balay PetscFunctionBegin; 1182f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->rmap)); 1183f17f7ee2SSatish Balay PetscCall(PetscLayoutSetUp(A->cmap)); 1184f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1185f17f7ee2SSatish Balay PetscCall(MatHasCongruentLayouts(A, &cong)); 1186f17f7ee2SSatish Balay PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts"); 1187f17f7ee2SSatish Balay N = A->rmap->N; 1188f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(comm, &size)); 1189f17f7ee2SSatish Balay if (spacedim > 0 && size > 1 && cdist) { 1190f17f7ee2SSatish Balay PetscSF sf; 1191f17f7ee2SSatish Balay MPI_Datatype dtype; 1192f17f7ee2SSatish Balay 1193f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype)); 1194f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_commit(&dtype)); 1195f17f7ee2SSatish Balay 1196f17f7ee2SSatish Balay PetscCall(PetscSFCreate(comm, &sf)); 1197f17f7ee2SSatish Balay PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER)); 1198f17f7ee2SSatish Balay PetscCall(PetscMalloc1(spacedim * N, &gcoords)); 1199f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE)); 1200f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE)); 1201f17f7ee2SSatish Balay PetscCall(PetscSFDestroy(&sf)); 1202f17f7ee2SSatish Balay PetscCallMPI(MPI_Type_free(&dtype)); 1203f17f7ee2SSatish Balay } else gcoords = (PetscReal *)coords; 1204f17f7ee2SSatish Balay 1205f17f7ee2SSatish Balay delete h2opus->ptcloud; 1206f17f7ee2SSatish Balay delete h2opus->kernel; 1207f17f7ee2SSatish Balay h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords); 1208f17f7ee2SSatish Balay if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx); 1209f17f7ee2SSatish Balay if (gcoords != coords) PetscCall(PetscFree(gcoords)); 1210f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 12113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1212f17f7ee2SSatish Balay } 1213f17f7ee2SSatish Balay 1214f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg) 1216d71ae5a4SJacob Faibussowitsch { 1217f17f7ee2SSatish Balay PetscMPIInt size; 1218f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1219f17f7ee2SSatish Balay 1220f17f7ee2SSatish Balay PetscFunctionBegin; 1221f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1222f17f7ee2SSatish Balay if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) { 1223f17f7ee2SSatish Balay if (size > 1) { 1224f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1225f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1226f17f7ee2SSatish Balay if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu); 1227f17f7ee2SSatish Balay else *a->dist_hmatrix = *a->dist_hmatrix_gpu; 1228f17f7ee2SSatish Balay #endif 1229f17f7ee2SSatish Balay } else { 1230f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1231f17f7ee2SSatish Balay if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu); 1232f17f7ee2SSatish Balay else *a->hmatrix = *a->hmatrix_gpu; 1233f17f7ee2SSatish Balay } 1234f17f7ee2SSatish Balay delete a->hmatrix_gpu; 1235f17f7ee2SSatish Balay delete a->dist_hmatrix_gpu; 1236f17f7ee2SSatish Balay a->hmatrix_gpu = NULL; 1237f17f7ee2SSatish Balay a->dist_hmatrix_gpu = NULL; 1238f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1239f17f7ee2SSatish Balay } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) { 1240f17f7ee2SSatish Balay if (size > 1) { 1241f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1242f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1243f17f7ee2SSatish Balay if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix); 1244f17f7ee2SSatish Balay else *a->dist_hmatrix_gpu = *a->dist_hmatrix; 1245f17f7ee2SSatish Balay #endif 1246f17f7ee2SSatish Balay } else { 1247f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1248f17f7ee2SSatish Balay if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix); 1249f17f7ee2SSatish Balay else *a->hmatrix_gpu = *a->hmatrix; 1250f17f7ee2SSatish Balay } 1251f17f7ee2SSatish Balay delete a->hmatrix; 1252f17f7ee2SSatish Balay delete a->dist_hmatrix; 1253f17f7ee2SSatish Balay a->hmatrix = NULL; 1254f17f7ee2SSatish Balay a->dist_hmatrix = NULL; 1255f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_GPU; 1256f17f7ee2SSatish Balay } 1257f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1258f17f7ee2SSatish Balay if (!flg) { 1259f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1260f17f7ee2SSatish Balay } else { 1261f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype)); 1262f17f7ee2SSatish Balay } 1263f17f7ee2SSatish Balay A->boundtocpu = flg; 12643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1265f17f7ee2SSatish Balay } 1266f17f7ee2SSatish Balay #endif 1267f17f7ee2SSatish Balay 1268f17f7ee2SSatish Balay /*MC 1269f17f7ee2SSatish Balay MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package. 1270f17f7ee2SSatish Balay 12712ef1f0ffSBarry Smith Options Database Key: 12722ef1f0ffSBarry Smith . -mat_type h2opus - matrix type to "h2opus" 12732ef1f0ffSBarry Smith 12742ef1f0ffSBarry Smith Level: beginner 1275f17f7ee2SSatish Balay 1276f17f7ee2SSatish Balay Notes: 127711a5261eSBarry Smith H2Opus implements hierarchical matrices in the H^2 flavour. It supports CPU or NVIDIA GPUs. 127811a5261eSBarry Smith 12792ef1f0ffSBarry Smith For CPU only builds, use `./configure --download-h2opus --download-thrust` to install PETSc to use H2Opus. 12802ef1f0ffSBarry Smith In order to run on NVIDIA GPUs, use `./configure --download-h2opus --download-magma --download-kblas`. 1281f17f7ee2SSatish Balay 128211a5261eSBarry Smith Reference: 128311a5261eSBarry Smith . * - "H2Opus: A distributed-memory multi-GPU software package for non-local operators", https://arxiv.org/abs/2109.05451 128411a5261eSBarry Smith 1285*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()` 1286f17f7ee2SSatish Balay M*/ 1287d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A) 1288d71ae5a4SJacob Faibussowitsch { 1289f17f7ee2SSatish Balay Mat_H2OPUS *a; 1290f17f7ee2SSatish Balay PetscMPIInt size; 1291f17f7ee2SSatish Balay 1292f17f7ee2SSatish Balay PetscFunctionBegin; 1293f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1294f17f7ee2SSatish Balay PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1295f17f7ee2SSatish Balay #endif 12964dfa11a4SJacob Faibussowitsch PetscCall(PetscNew(&a)); 1297f17f7ee2SSatish Balay A->data = (void *)a; 1298f17f7ee2SSatish Balay 1299f17f7ee2SSatish Balay a->eta = 0.9; 1300f17f7ee2SSatish Balay a->leafsize = 32; 1301f17f7ee2SSatish Balay a->basisord = 4; 1302f17f7ee2SSatish Balay a->max_rank = 64; 1303f17f7ee2SSatish Balay a->bs = 32; 1304f17f7ee2SSatish Balay a->rtol = 1.e-4; 1305f17f7ee2SSatish Balay a->s = 1.0; 1306f17f7ee2SSatish Balay a->norm_max_samples = 10; 1307036e4bdcSSatish Balay a->resize = PETSC_TRUE; /* reallocate after compression */ 1308f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1309f17f7ee2SSatish Balay h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A)); 1310f17f7ee2SSatish Balay #else 1311f17f7ee2SSatish Balay h2opusCreateHandle(&a->handle); 1312f17f7ee2SSatish Balay #endif 1313f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1314f17f7ee2SSatish Balay PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS)); 1315f17f7ee2SSatish Balay PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps))); 1316f17f7ee2SSatish Balay 1317f17f7ee2SSatish Balay A->ops->destroy = MatDestroy_H2OPUS; 1318f17f7ee2SSatish Balay A->ops->view = MatView_H2OPUS; 1319f17f7ee2SSatish Balay A->ops->assemblyend = MatAssemblyEnd_H2OPUS; 1320f17f7ee2SSatish Balay A->ops->mult = MatMult_H2OPUS; 1321f17f7ee2SSatish Balay A->ops->multtranspose = MatMultTranspose_H2OPUS; 1322f17f7ee2SSatish Balay A->ops->multadd = MatMultAdd_H2OPUS; 1323f17f7ee2SSatish Balay A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS; 1324f17f7ee2SSatish Balay A->ops->scale = MatScale_H2OPUS; 1325f17f7ee2SSatish Balay A->ops->duplicate = MatDuplicate_H2OPUS; 1326f17f7ee2SSatish Balay A->ops->setfromoptions = MatSetFromOptions_H2OPUS; 1327f17f7ee2SSatish Balay A->ops->norm = MatNorm_H2OPUS; 1328f17f7ee2SSatish Balay A->ops->zeroentries = MatZeroEntries_H2OPUS; 1329f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1330f17f7ee2SSatish Balay A->ops->bindtocpu = MatBindToCPU_H2OPUS; 1331f17f7ee2SSatish Balay #endif 1332f17f7ee2SSatish Balay 1333f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS)); 1334f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS)); 1335f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS)); 1336f17f7ee2SSatish Balay PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS)); 1337f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1338f17f7ee2SSatish Balay PetscCall(PetscFree(A->defaultvectype)); 1339f17f7ee2SSatish Balay PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 1340f17f7ee2SSatish Balay #endif 13413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1342f17f7ee2SSatish Balay } 1343f17f7ee2SSatish Balay 1344f17f7ee2SSatish Balay /*@C 1345f17f7ee2SSatish Balay MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix. 1346f17f7ee2SSatish Balay 1347f17f7ee2SSatish Balay Input Parameter: 1348f17f7ee2SSatish Balay . A - the matrix 1349f17f7ee2SSatish Balay 1350f17f7ee2SSatish Balay Level: intermediate 1351f17f7ee2SSatish Balay 1352*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()` 1353f17f7ee2SSatish Balay @*/ 1354d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusOrthogonalize(Mat A) 1355d71ae5a4SJacob Faibussowitsch { 1356f17f7ee2SSatish Balay PetscBool ish2opus; 1357f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1358f17f7ee2SSatish Balay PetscMPIInt size; 1359f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1360f17f7ee2SSatish Balay 1361f17f7ee2SSatish Balay PetscFunctionBegin; 1362f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1363f17f7ee2SSatish Balay PetscValidType(A, 1); 1364f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 13653ba16761SJacob Faibussowitsch if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS); 13663ba16761SJacob Faibussowitsch if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS); 1367f17f7ee2SSatish Balay HLibProfile::clear(); 1368f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0)); 1369f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1370f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1371f17f7ee2SSatish Balay #endif 1372f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1373f17f7ee2SSatish Balay if (size > 1) { 1374f17f7ee2SSatish Balay if (boundtocpu) { 1375f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1376f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1377f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix, a->handle); 1378f17f7ee2SSatish Balay #endif 1379f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1380f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1381f17f7ee2SSatish Balay } else { 1382f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1383f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1384f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1385f17f7ee2SSatish Balay distributed_horthog(*a->dist_hmatrix_gpu, a->handle); 1386f17f7ee2SSatish Balay #endif 1387f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1388f17f7ee2SSatish Balay #endif 1389f17f7ee2SSatish Balay } 1390f17f7ee2SSatish Balay } else { 1391f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1392f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1393f17f7ee2SSatish Balay #else 1394f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1395f17f7ee2SSatish Balay #endif 1396f17f7ee2SSatish Balay if (boundtocpu) { 1397f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1398f17f7ee2SSatish Balay horthog(*a->hmatrix, handle); 1399f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1400f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1401f17f7ee2SSatish Balay } else { 1402f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1403f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1404f17f7ee2SSatish Balay horthog(*a->hmatrix_gpu, handle); 1405f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1406f17f7ee2SSatish Balay #endif 1407f17f7ee2SSatish Balay } 1408f17f7ee2SSatish Balay } 1409f17f7ee2SSatish Balay a->orthogonal = PETSC_TRUE; 1410f17f7ee2SSatish Balay { /* log flops */ 1411f17f7ee2SSatish Balay double gops, time, perf, dev; 1412f17f7ee2SSatish Balay HLibProfile::getHorthogPerf(gops, time, perf, dev); 1413f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1414f17f7ee2SSatish Balay if (boundtocpu) { 1415f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 1416f17f7ee2SSatish Balay } else { 1417f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops)); 1418f17f7ee2SSatish Balay } 1419f17f7ee2SSatish Balay #else 1420f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 1421f17f7ee2SSatish Balay #endif 1422f17f7ee2SSatish Balay } 1423f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0)); 14243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1425f17f7ee2SSatish Balay } 1426f17f7ee2SSatish Balay 1427f17f7ee2SSatish Balay /*@C 1428f17f7ee2SSatish Balay MatH2OpusCompress - Compress a hierarchical matrix. 1429f17f7ee2SSatish Balay 1430f17f7ee2SSatish Balay Input Parameters: 1431f17f7ee2SSatish Balay + A - the matrix 1432f17f7ee2SSatish Balay - tol - the absolute truncation threshold 1433f17f7ee2SSatish Balay 1434f17f7ee2SSatish Balay Level: intermediate 1435f17f7ee2SSatish Balay 1436*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()` 1437f17f7ee2SSatish Balay @*/ 1438d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol) 1439d71ae5a4SJacob Faibussowitsch { 1440f17f7ee2SSatish Balay PetscBool ish2opus; 1441f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1442f17f7ee2SSatish Balay PetscMPIInt size; 1443f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1444f17f7ee2SSatish Balay 1445f17f7ee2SSatish Balay PetscFunctionBegin; 1446f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1447f17f7ee2SSatish Balay PetscValidType(A, 1); 1448f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A, tol, 2); 1449f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 14503ba16761SJacob Faibussowitsch if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS); 1451f17f7ee2SSatish Balay PetscCall(MatH2OpusOrthogonalize(A)); 1452f17f7ee2SSatish Balay HLibProfile::clear(); 1453f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0)); 1454f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1455f17f7ee2SSatish Balay boundtocpu = A->boundtocpu; 1456f17f7ee2SSatish Balay #endif 1457f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1458f17f7ee2SSatish Balay if (size > 1) { 1459f17f7ee2SSatish Balay if (boundtocpu) { 1460f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1461f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1462f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix, tol, a->handle); 1463036e4bdcSSatish Balay if (a->resize) { 1464036e4bdcSSatish Balay DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix); 1465036e4bdcSSatish Balay delete a->dist_hmatrix; 1466036e4bdcSSatish Balay a->dist_hmatrix = dist_hmatrix; 1467036e4bdcSSatish Balay } 1468f17f7ee2SSatish Balay #endif 1469f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1470f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1471f17f7ee2SSatish Balay } else { 1472f17f7ee2SSatish Balay PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1473f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1474f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1475f17f7ee2SSatish Balay distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle); 1476036e4bdcSSatish Balay 1477036e4bdcSSatish Balay if (a->resize) { 1478036e4bdcSSatish Balay DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu); 1479036e4bdcSSatish Balay delete a->dist_hmatrix_gpu; 1480036e4bdcSSatish Balay a->dist_hmatrix_gpu = dist_hmatrix_gpu; 1481036e4bdcSSatish Balay } 1482f17f7ee2SSatish Balay #endif 1483f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1484f17f7ee2SSatish Balay #endif 1485f17f7ee2SSatish Balay } 1486f17f7ee2SSatish Balay } else { 1487f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1488f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1489f17f7ee2SSatish Balay #else 1490f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1491f17f7ee2SSatish Balay #endif 1492f17f7ee2SSatish Balay if (boundtocpu) { 1493f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1494f17f7ee2SSatish Balay hcompress(*a->hmatrix, tol, handle); 1495036e4bdcSSatish Balay 1496036e4bdcSSatish Balay if (a->resize) { 1497036e4bdcSSatish Balay HMatrix *hmatrix = new HMatrix(*a->hmatrix); 1498036e4bdcSSatish Balay delete a->hmatrix; 1499036e4bdcSSatish Balay a->hmatrix = hmatrix; 1500036e4bdcSSatish Balay } 1501f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1502f17f7ee2SSatish Balay A->offloadmask = PETSC_OFFLOAD_CPU; 1503f17f7ee2SSatish Balay } else { 1504f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1505f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeBegin()); 1506f17f7ee2SSatish Balay hcompress(*a->hmatrix_gpu, tol, handle); 1507f17f7ee2SSatish Balay PetscCall(PetscLogGpuTimeEnd()); 1508036e4bdcSSatish Balay 1509036e4bdcSSatish Balay if (a->resize) { 1510036e4bdcSSatish Balay HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu); 1511036e4bdcSSatish Balay delete a->hmatrix_gpu; 1512036e4bdcSSatish Balay a->hmatrix_gpu = hmatrix_gpu; 1513036e4bdcSSatish Balay } 1514f17f7ee2SSatish Balay #endif 1515f17f7ee2SSatish Balay } 1516f17f7ee2SSatish Balay } 1517f17f7ee2SSatish Balay { /* log flops */ 1518f17f7ee2SSatish Balay double gops, time, perf, dev; 1519f17f7ee2SSatish Balay HLibProfile::getHcompressPerf(gops, time, perf, dev); 1520f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1521f17f7ee2SSatish Balay if (boundtocpu) { 1522f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 1523f17f7ee2SSatish Balay } else { 1524f17f7ee2SSatish Balay PetscCall(PetscLogGpuFlops(1e9 * gops)); 1525f17f7ee2SSatish Balay } 1526f17f7ee2SSatish Balay #else 1527f17f7ee2SSatish Balay PetscCall(PetscLogFlops(1e9 * gops)); 1528f17f7ee2SSatish Balay #endif 1529f17f7ee2SSatish Balay } 1530f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0)); 15313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1532f17f7ee2SSatish Balay } 1533f17f7ee2SSatish Balay 1534f17f7ee2SSatish Balay /*@C 15352ef1f0ffSBarry Smith MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix-vector products on another matrix to construct a hierarchical matrix. 1536f17f7ee2SSatish Balay 1537f17f7ee2SSatish Balay Input Parameters: 1538f17f7ee2SSatish Balay + A - the hierarchical matrix 1539f17f7ee2SSatish Balay . B - the matrix to be sampled 1540f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1541f17f7ee2SSatish Balay - tol - relative tolerance for construction 1542f17f7ee2SSatish Balay 154311a5261eSBarry Smith Notes: 154411a5261eSBarry Smith You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix. 1545f17f7ee2SSatish Balay 1546f17f7ee2SSatish Balay Level: intermediate 1547f17f7ee2SSatish Balay 1548*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()` 1549f17f7ee2SSatish Balay @*/ 1550d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol) 1551d71ae5a4SJacob Faibussowitsch { 1552f17f7ee2SSatish Balay PetscBool ish2opus; 1553f17f7ee2SSatish Balay 1554f17f7ee2SSatish Balay PetscFunctionBegin; 1555f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1556f17f7ee2SSatish Balay PetscValidType(A, 1); 1557f17f7ee2SSatish Balay if (B) PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 1558f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(A, bs, 3); 1559f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(A, tol, 3); 1560f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1561f17f7ee2SSatish Balay if (ish2opus) { 1562f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1563f17f7ee2SSatish Balay 1564f17f7ee2SSatish Balay if (!a->sampler) a->sampler = new PetscMatrixSampler(); 1565f17f7ee2SSatish Balay a->sampler->SetSamplingMat(B); 1566f17f7ee2SSatish Balay if (bs > 0) a->bs = bs; 1567f17f7ee2SSatish Balay if (tol > 0.) a->rtol = tol; 1568f17f7ee2SSatish Balay delete a->kernel; 1569f17f7ee2SSatish Balay } 15703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1571f17f7ee2SSatish Balay } 1572f17f7ee2SSatish Balay 1573f17f7ee2SSatish Balay /*@C 157411a5261eSBarry Smith MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel. 1575f17f7ee2SSatish Balay 1576f17f7ee2SSatish Balay Input Parameters: 1577f17f7ee2SSatish Balay + comm - MPI communicator 15782ef1f0ffSBarry Smith . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 15792ef1f0ffSBarry Smith . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given) 15802ef1f0ffSBarry Smith . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 15812ef1f0ffSBarry Smith . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 1582f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1583f17f7ee2SSatish Balay . coords - coordinates of the points 1584f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 15852ef1f0ffSBarry Smith . kernel - computational kernel (or `NULL`) 1586f17f7ee2SSatish Balay . kernelctx - kernel context 1587f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1588f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1589f17f7ee2SSatish Balay - basisord - approximation order for Chebychev interpolation of low-rank blocks 1590f17f7ee2SSatish Balay 1591f17f7ee2SSatish Balay Output Parameter: 1592f17f7ee2SSatish Balay . nA - matrix 1593f17f7ee2SSatish Balay 1594f17f7ee2SSatish Balay Options Database Keys: 159511a5261eSBarry Smith + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 159611a5261eSBarry Smith . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 159711a5261eSBarry Smith . -mat_h2opus_order <`PetscInt`> - Chebychev approximation order 159811a5261eSBarry Smith - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms 1599f17f7ee2SSatish Balay 1600f17f7ee2SSatish Balay Level: intermediate 1601f17f7ee2SSatish Balay 1602*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()` 1603f17f7ee2SSatish Balay @*/ 1604d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA) 1605d71ae5a4SJacob Faibussowitsch { 1606f17f7ee2SSatish Balay Mat A; 1607f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1608f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1609f17f7ee2SSatish Balay PetscBool iscpu = PETSC_FALSE; 1610f17f7ee2SSatish Balay #else 1611f17f7ee2SSatish Balay PetscBool iscpu = PETSC_TRUE; 1612f17f7ee2SSatish Balay #endif 1613f17f7ee2SSatish Balay 1614f17f7ee2SSatish Balay PetscFunctionBegin; 1615036e4bdcSSatish Balay PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1616f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A)); 1617f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, m, n, M, N)); 1618036e4bdcSSatish Balay PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1619f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS)); 1620f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, iscpu)); 1621f17f7ee2SSatish Balay PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx)); 1622f17f7ee2SSatish Balay 1623f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS *)A->data; 1624f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1625f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1626f17f7ee2SSatish Balay if (basisord > 0) h2opus->basisord = basisord; 1627f17f7ee2SSatish Balay 1628f17f7ee2SSatish Balay *nA = A; 16293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1630f17f7ee2SSatish Balay } 1631f17f7ee2SSatish Balay 1632f17f7ee2SSatish Balay /*@C 163311a5261eSBarry Smith MatCreateH2OpusFromMat - Creates a `MATH2OPUS` sampling from a user-supplied operator. 1634f17f7ee2SSatish Balay 1635f17f7ee2SSatish Balay Input Parameters: 1636f17f7ee2SSatish Balay + B - the matrix to be sampled 1637f17f7ee2SSatish Balay . spacedim - dimension of the space coordinates 1638f17f7ee2SSatish Balay . coords - coordinates of the points 1639f17f7ee2SSatish Balay . cdist - whether or not coordinates are distributed 1640f17f7ee2SSatish Balay . eta - admissibility condition tolerance 1641f17f7ee2SSatish Balay . leafsize - leaf size in cluster tree 1642f17f7ee2SSatish Balay . maxrank - maximum rank allowed 1643f17f7ee2SSatish Balay . bs - maximum number of samples to be taken concurrently 1644f17f7ee2SSatish Balay - rtol - relative tolerance for construction 1645f17f7ee2SSatish Balay 1646f17f7ee2SSatish Balay Output Parameter: 1647f17f7ee2SSatish Balay . nA - matrix 1648f17f7ee2SSatish Balay 1649f17f7ee2SSatish Balay Options Database Keys: 165011a5261eSBarry Smith + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree 165111a5261eSBarry Smith . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance 165211a5261eSBarry Smith . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs 165311a5261eSBarry Smith . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs 165411a5261eSBarry Smith . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling 165511a5261eSBarry Smith . -mat_h2opus_check <`PetscBool`> - Check error when constructing from sampling during MatAssemblyEnd() 165611a5261eSBarry Smith . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from hara construction 1657aaa8cc7dSPierre Jolivet - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be when estimating norms 1658f17f7ee2SSatish Balay 16592ef1f0ffSBarry Smith Level: intermediate 16602ef1f0ffSBarry Smith 166111a5261eSBarry Smith Note: 166211a5261eSBarry Smith Not available in parallel 1663f17f7ee2SSatish Balay 1664*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()` 1665f17f7ee2SSatish Balay @*/ 1666d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA) 1667d71ae5a4SJacob Faibussowitsch { 1668f17f7ee2SSatish Balay Mat A; 1669f17f7ee2SSatish Balay Mat_H2OPUS *h2opus; 1670f17f7ee2SSatish Balay MPI_Comm comm; 1671f17f7ee2SSatish Balay PetscBool boundtocpu = PETSC_TRUE; 1672f17f7ee2SSatish Balay 1673f17f7ee2SSatish Balay PetscFunctionBegin; 1674f17f7ee2SSatish Balay PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 1675f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, spacedim, 2); 1676f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(B, cdist, 4); 1677f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B, eta, 5); 1678f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, leafsize, 6); 1679f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, maxrank, 7); 1680f17f7ee2SSatish Balay PetscValidLogicalCollectiveInt(B, bs, 8); 1681f17f7ee2SSatish Balay PetscValidLogicalCollectiveReal(B, rtol, 9); 1682f17f7ee2SSatish Balay PetscValidPointer(nA, 10); 1683f17f7ee2SSatish Balay PetscCall(PetscObjectGetComm((PetscObject)B, &comm)); 1684036e4bdcSSatish Balay PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported"); 1685036e4bdcSSatish Balay PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported"); 1686f17f7ee2SSatish Balay PetscCall(MatCreate(comm, &A)); 1687f17f7ee2SSatish Balay PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N)); 1688f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1689f17f7ee2SSatish Balay { 1690f17f7ee2SSatish Balay PetscBool iscuda; 1691f17f7ee2SSatish Balay VecType vtype; 1692f17f7ee2SSatish Balay 1693f17f7ee2SSatish Balay PetscCall(MatGetVecType(B, &vtype)); 1694f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype, VECCUDA, &iscuda)); 1695f17f7ee2SSatish Balay if (!iscuda) { 1696f17f7ee2SSatish Balay PetscCall(PetscStrcmp(vtype, VECSEQCUDA, &iscuda)); 169748a46eb9SPierre Jolivet if (!iscuda) PetscCall(PetscStrcmp(vtype, VECMPICUDA, &iscuda)); 1698f17f7ee2SSatish Balay } 1699f17f7ee2SSatish Balay if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE; 1700f17f7ee2SSatish Balay } 1701f17f7ee2SSatish Balay #endif 1702f17f7ee2SSatish Balay PetscCall(MatSetType(A, MATH2OPUS)); 1703f17f7ee2SSatish Balay PetscCall(MatBindToCPU(A, boundtocpu)); 170448a46eb9SPierre Jolivet if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL)); 1705f17f7ee2SSatish Balay PetscCall(MatPropagateSymmetryOptions(B, A)); 1706f17f7ee2SSatish Balay /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */ 1707f17f7ee2SSatish Balay 1708f17f7ee2SSatish Balay h2opus = (Mat_H2OPUS *)A->data; 1709f17f7ee2SSatish Balay h2opus->sampler = new PetscMatrixSampler(B); 1710f17f7ee2SSatish Balay if (eta > 0.) h2opus->eta = eta; 1711f17f7ee2SSatish Balay if (leafsize > 0) h2opus->leafsize = leafsize; 1712f17f7ee2SSatish Balay if (maxrank > 0) h2opus->max_rank = maxrank; 1713f17f7ee2SSatish Balay if (bs > 0) h2opus->bs = bs; 1714f17f7ee2SSatish Balay if (rtol > 0.) h2opus->rtol = rtol; 1715f17f7ee2SSatish Balay *nA = A; 1716f17f7ee2SSatish Balay A->preallocated = PETSC_TRUE; 17173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1718f17f7ee2SSatish Balay } 1719f17f7ee2SSatish Balay 1720f17f7ee2SSatish Balay /*@C 1721f17f7ee2SSatish Balay MatH2OpusGetIndexMap - Access reordering index set. 1722f17f7ee2SSatish Balay 17232fe279fdSBarry Smith Input Parameter: 1724f17f7ee2SSatish Balay . A - the matrix 1725f17f7ee2SSatish Balay 1726f17f7ee2SSatish Balay Output Parameter: 1727f17f7ee2SSatish Balay . indexmap - the index set for the reordering 1728f17f7ee2SSatish Balay 1729f17f7ee2SSatish Balay Level: intermediate 1730f17f7ee2SSatish Balay 1731*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1732f17f7ee2SSatish Balay @*/ 1733d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap) 1734d71ae5a4SJacob Faibussowitsch { 1735f17f7ee2SSatish Balay PetscBool ish2opus; 1736f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1737f17f7ee2SSatish Balay 1738f17f7ee2SSatish Balay PetscFunctionBegin; 1739f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1740f17f7ee2SSatish Balay PetscValidType(A, 1); 1741f17f7ee2SSatish Balay PetscValidPointer(indexmap, 2); 1742f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1743f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1744f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1745f17f7ee2SSatish Balay *indexmap = a->h2opus_indexmap; 17463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1747f17f7ee2SSatish Balay } 1748f17f7ee2SSatish Balay 1749f17f7ee2SSatish Balay /*@C 1750f17f7ee2SSatish Balay MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering 1751f17f7ee2SSatish Balay 1752f17f7ee2SSatish Balay Input Parameters: 1753f17f7ee2SSatish Balay + A - the matrix 1754f17f7ee2SSatish Balay . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map 1755f17f7ee2SSatish Balay - in - the vector to be mapped 1756f17f7ee2SSatish Balay 1757f17f7ee2SSatish Balay Output Parameter: 1758f17f7ee2SSatish Balay . out - the newly created mapped vector 1759f17f7ee2SSatish Balay 1760f17f7ee2SSatish Balay Level: intermediate 1761f17f7ee2SSatish Balay 1762*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()` 1763f17f7ee2SSatish Balay @*/ 1764d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out) 1765d71ae5a4SJacob Faibussowitsch { 1766f17f7ee2SSatish Balay PetscBool ish2opus; 1767f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1768f17f7ee2SSatish Balay PetscScalar *xin, *xout; 1769f17f7ee2SSatish Balay PetscBool nm; 1770f17f7ee2SSatish Balay 1771f17f7ee2SSatish Balay PetscFunctionBegin; 1772f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1773f17f7ee2SSatish Balay PetscValidType(A, 1); 1774f17f7ee2SSatish Balay PetscValidLogicalCollectiveBool(A, nativetopetsc, 2); 1775f17f7ee2SSatish Balay PetscValidHeaderSpecific(in, VEC_CLASSID, 3); 1776f17f7ee2SSatish Balay PetscValidPointer(out, 4); 1777f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1778f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus)); 1779f17f7ee2SSatish Balay PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name); 1780f17f7ee2SSatish Balay nm = a->nativemult; 1781f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc)); 1782f17f7ee2SSatish Balay PetscCall(MatCreateVecs(A, out, NULL)); 1783f17f7ee2SSatish Balay PetscCall(MatH2OpusSetNativeMult(A, nm)); 1784f17f7ee2SSatish Balay if (!a->sf) { /* same ordering */ 1785f17f7ee2SSatish Balay PetscCall(VecCopy(in, *out)); 17863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1787f17f7ee2SSatish Balay } 1788f17f7ee2SSatish Balay PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin)); 1789f17f7ee2SSatish Balay PetscCall(VecGetArrayWrite(*out, &xout)); 1790f17f7ee2SSatish Balay if (nativetopetsc) { 1791f17f7ee2SSatish Balay PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1792f17f7ee2SSatish Balay PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1793f17f7ee2SSatish Balay } else { 1794f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1795f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE)); 1796f17f7ee2SSatish Balay } 1797f17f7ee2SSatish Balay PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin)); 1798f17f7ee2SSatish Balay PetscCall(VecRestoreArrayWrite(*out, &xout)); 17993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1800f17f7ee2SSatish Balay } 1801f17f7ee2SSatish Balay 1802f17f7ee2SSatish Balay /*@C 1803f17f7ee2SSatish Balay MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T 1804f17f7ee2SSatish Balay 1805f17f7ee2SSatish Balay Input Parameters: 180611a5261eSBarry Smith + A - the hierarchical `MATH2OPUS` matrix 1807f17f7ee2SSatish Balay . s - the scaling factor 1808f17f7ee2SSatish Balay . U - the dense low-rank update matrix 18092ef1f0ffSBarry Smith - V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed) 1810f17f7ee2SSatish Balay 181111a5261eSBarry Smith Note: 18122ef1f0ffSBarry Smith The `U` and `V` matrices must be in dense format 1813f17f7ee2SSatish Balay 1814f17f7ee2SSatish Balay Level: intermediate 1815f17f7ee2SSatish Balay 1816*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE` 1817f17f7ee2SSatish Balay @*/ 1818d71ae5a4SJacob Faibussowitsch PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s) 1819d71ae5a4SJacob Faibussowitsch { 1820f17f7ee2SSatish Balay PetscBool flg; 1821f17f7ee2SSatish Balay 1822f17f7ee2SSatish Balay PetscFunctionBegin; 1823f17f7ee2SSatish Balay PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1824f17f7ee2SSatish Balay PetscValidType(A, 1); 1825f17f7ee2SSatish Balay PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 1826f17f7ee2SSatish Balay PetscValidHeaderSpecific(U, MAT_CLASSID, 2); 1827f17f7ee2SSatish Balay PetscCheckSameComm(A, 1, U, 2); 1828f17f7ee2SSatish Balay if (V) { 1829f17f7ee2SSatish Balay PetscValidHeaderSpecific(V, MAT_CLASSID, 3); 1830f17f7ee2SSatish Balay PetscCheckSameComm(A, 1, V, 3); 1831f17f7ee2SSatish Balay } 1832f17f7ee2SSatish Balay PetscValidLogicalCollectiveScalar(A, s, 4); 1833f17f7ee2SSatish Balay 1834f17f7ee2SSatish Balay if (!V) V = U; 1835036e4bdcSSatish Balay PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N); 18363ba16761SJacob Faibussowitsch if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS); 1837f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg)); 1838f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout"); 1839f17f7ee2SSatish Balay PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg)); 1840f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row column layout"); 1841f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg)); 1842f17f7ee2SSatish Balay if (flg) { 1843f17f7ee2SSatish Balay Mat_H2OPUS *a = (Mat_H2OPUS *)A->data; 1844f17f7ee2SSatish Balay const PetscScalar *u, *v, *uu, *vv; 1845f17f7ee2SSatish Balay PetscInt ldu, ldv; 1846f17f7ee2SSatish Balay PetscMPIInt size; 1847f17f7ee2SSatish Balay #if defined(H2OPUS_USE_MPI) 1848f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle->handle; 1849f17f7ee2SSatish Balay #else 1850f17f7ee2SSatish Balay h2opusHandle_t handle = a->handle; 1851f17f7ee2SSatish Balay #endif 1852f17f7ee2SSatish Balay PetscBool usesf = (PetscBool)(a->sf && !a->nativemult); 1853f17f7ee2SSatish Balay PetscSF usf, vsf; 1854f17f7ee2SSatish Balay 1855f17f7ee2SSatish Balay PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1856036e4bdcSSatish Balay PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel"); 1857f17f7ee2SSatish Balay PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0)); 1858f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1859f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name); 1860f17f7ee2SSatish Balay PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, "")); 1861f17f7ee2SSatish Balay PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name); 1862f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(U, &ldu)); 1863f17f7ee2SSatish Balay PetscCall(MatDenseGetLDA(V, &ldv)); 1864f17f7ee2SSatish Balay PetscCall(MatBoundToCPU(A, &flg)); 1865f17f7ee2SSatish Balay if (usesf) { 1866f17f7ee2SSatish Balay PetscInt n; 1867f17f7ee2SSatish Balay 1868f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(U, a->sf, &usf)); 1869f17f7ee2SSatish Balay PetscCall(MatDenseGetH2OpusVectorSF(V, a->sf, &vsf)); 1870f17f7ee2SSatish Balay PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N)); 1871f17f7ee2SSatish Balay PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL)); 1872f17f7ee2SSatish Balay ldu = n; 1873f17f7ee2SSatish Balay ldv = n; 1874f17f7ee2SSatish Balay } 1875f17f7ee2SSatish Balay if (flg) { 1876f17f7ee2SSatish Balay PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix"); 1877f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(U, &u)); 1878f17f7ee2SSatish Balay PetscCall(MatDenseGetArrayRead(V, &v)); 1879f17f7ee2SSatish Balay if (usesf) { 1880f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy); 1881f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1882f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1883f17f7ee2SSatish Balay if (U != V) { 1884f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx); 1885f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1886f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1887f17f7ee2SSatish Balay } else uu = vv; 18889371c9d4SSatish Balay } else { 18899371c9d4SSatish Balay uu = u; 18909371c9d4SSatish Balay vv = v; 18919371c9d4SSatish Balay } 1892f17f7ee2SSatish Balay hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1893f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(U, &u)); 1894f17f7ee2SSatish Balay PetscCall(MatDenseRestoreArrayRead(V, &v)); 1895f17f7ee2SSatish Balay } else { 1896f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1897f17f7ee2SSatish Balay PetscBool flgU, flgV; 1898f17f7ee2SSatish Balay 1899f17f7ee2SSatish Balay PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix"); 1900f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, "")); 1901f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U)); 1902f17f7ee2SSatish Balay PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, "")); 1903f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V)); 1904f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(U, &u)); 1905f17f7ee2SSatish Balay PetscCall(MatDenseCUDAGetArrayRead(V, &v)); 1906f17f7ee2SSatish Balay if (usesf) { 1907f17f7ee2SSatish Balay vv = MatH2OpusGetThrustPointer(*a->yy_gpu); 1908f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1909f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE)); 1910f17f7ee2SSatish Balay if (U != V) { 1911f17f7ee2SSatish Balay uu = MatH2OpusGetThrustPointer(*a->xx_gpu); 1912f17f7ee2SSatish Balay PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1913f17f7ee2SSatish Balay PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE)); 1914f17f7ee2SSatish Balay } else uu = vv; 19159371c9d4SSatish Balay } else { 19169371c9d4SSatish Balay uu = u; 19179371c9d4SSatish Balay vv = v; 19189371c9d4SSatish Balay } 1919f17f7ee2SSatish Balay #else 1920f17f7ee2SSatish Balay SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen"); 1921f17f7ee2SSatish Balay #endif 1922f17f7ee2SSatish Balay hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle); 1923f17f7ee2SSatish Balay #if defined(PETSC_H2OPUS_USE_GPU) 1924f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(U, &u)); 1925f17f7ee2SSatish Balay PetscCall(MatDenseCUDARestoreArrayRead(V, &v)); 1926f17f7ee2SSatish Balay if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U)); 1927f17f7ee2SSatish Balay if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V)); 1928f17f7ee2SSatish Balay #endif 1929f17f7ee2SSatish Balay } 1930f17f7ee2SSatish Balay PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0)); 1931f17f7ee2SSatish Balay a->orthogonal = PETSC_FALSE; 1932f17f7ee2SSatish Balay } 19333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1934f17f7ee2SSatish Balay } 1935f17f7ee2SSatish Balay #endif 1936