1*a4963045SJacob Faibussowitsch #pragma once 240e23c03SJunchao Zhang 3cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 47fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 50e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h> 671438e86SJunchao Zhang typedef cudaStream_t cupmStream_t; 771438e86SJunchao Zhang typedef cudaEvent_t cupmEvent_t; 87fd2d3dbSJunchao Zhang #endif 97fd2d3dbSJunchao Zhang 107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP) 110e6b6b59SJacob Faibussowitsch #include <petscdevice_hip.h> 1271438e86SJunchao Zhang typedef hipStream_t cupmStream_t; 1371438e86SJunchao Zhang typedef hipEvent_t cupmEvent_t; 147fd2d3dbSJunchao Zhang #endif 15cd620004SJunchao Zhang 16874d28e3SJunchao Zhang /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to. 17da81f932SPierre Jolivet We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int. 18874d28e3SJunchao Zhang */ 19874d28e3SJunchao Zhang typedef long long int llint; 20874d28e3SJunchao Zhang typedef unsigned long long int ullint; 21874d28e3SJunchao Zhang 22cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 239371c9d4SSatish Balay typedef enum { 249371c9d4SSatish Balay PETSCSF_LOCAL = 0, 259371c9d4SSatish Balay PETSCSF_REMOTE 269371c9d4SSatish Balay } PetscSFScope; 2740e23c03SJunchao Zhang 28fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks. 2940e23c03SJunchao Zhang 30fcc7397dSJunchao Zhang Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 31fcc7397dSJunchao Zhang p[i] = u[idx[i]], for i in [0,m) 3240e23c03SJunchao Zhang 33fcc7397dSJunchao Zhang Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 34fcc7397dSJunchao Zhang We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 35fcc7397dSJunchao Zhang a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 36cd620004SJunchao Zhang 37fcc7397dSJunchao Zhang E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 38fcc7397dSJunchao Zhang is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 39fcc7397dSJunchao Zhang 40fcc7397dSJunchao Zhang Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 41fcc7397dSJunchao Zhang indices in whole are contiguous, and therefore much more useful than this one when true. 4240e23c03SJunchao Zhang */ 4340e23c03SJunchao Zhang struct _n_PetscSFPackOpt { 44fcc7397dSJunchao Zhang PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 45b23bfdefSJunchao Zhang PetscInt n; /* Number of destination ranks */ 46fcc7397dSJunchao Zhang PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 47fcc7397dSJunchao Zhang PetscInt *start; /* [n] First index */ 48fcc7397dSJunchao Zhang PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 49fcc7397dSJunchao Zhang PetscInt *X, *Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 5040e23c03SJunchao Zhang }; 5140e23c03SJunchao Zhang 52eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 5340e23c03SJunchao Zhang */ 54fcc7397dSJunchao Zhang struct _n_PetscSFLink { 5571438e86SJunchao Zhang PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */ 5671438e86SJunchao Zhang PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection); 5771438e86SJunchao Zhang PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection); 5871438e86SJunchao Zhang PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection); 5971438e86SJunchao Zhang PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection); 6071438e86SJunchao Zhang PetscErrorCode (*SyncDevice)(PetscSFLink); 6171438e86SJunchao Zhang PetscErrorCode (*SyncStream)(PetscSFLink); 6271438e86SJunchao Zhang PetscErrorCode (*Destroy)(PetscSF, PetscSFLink); 6371438e86SJunchao Zhang 6471438e86SJunchao Zhang PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection); 6571438e86SJunchao Zhang PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection); 6620c24465SJunchao Zhang 67fcc7397dSJunchao Zhang PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 68fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 69fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 70fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 71fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 72fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 73fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 74fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 75fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 76fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 77fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 78fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 79fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 80fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 81fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 82fcc7397dSJunchao Zhang 83fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 84fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 85fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 86fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 87fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 88fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 89fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 90fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 91fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 92fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 93fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 94fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 95fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 96fcc7397dSJunchao Zhang 97fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 98cd620004SJunchao Zhang 99cd620004SJunchao Zhang PetscBool deviceinited; /* Are device related fields initialized? */ 1007fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 101eb02082bSJunchao Zhang /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 1027fd2d3dbSJunchao Zhang will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 103eb02082bSJunchao Zhang */ 104fcc7397dSJunchao Zhang PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 105fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 106fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 107fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 108fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 109fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 110fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 111fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 112fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 113fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 114fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 115fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 116fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 117fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 118fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 119eb02082bSJunchao Zhang 120fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 121fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 122fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 123fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 124fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 125fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 126fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 127fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 128fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 129fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 130fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 131fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 132fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 133fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 134eb02082bSJunchao Zhang 135eb02082bSJunchao Zhang /* Packing routines using atomics when there are data race chances */ 136fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 137fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 138fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 139fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 140fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 141fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 142fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 143fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 144fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 145fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 146fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 147fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 148fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 149fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 150cd620004SJunchao Zhang 151fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 152fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 153fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 154fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 155fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 156fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 157fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 158fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 159fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 160fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 161fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 162fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 163fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 164fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 16571438e86SJunchao Zhang #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) 166e315309dSJunchao Zhang PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 16771438e86SJunchao Zhang cupmStream_t stream; /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */ 168eb02082bSJunchao Zhang #endif 1697fd2d3dbSJunchao Zhang #endif 170eb02082bSJunchao Zhang PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 171cd620004SJunchao Zhang MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 172eb02082bSJunchao Zhang MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 173e07844bfSJunchao Zhang PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 174eb02082bSJunchao Zhang size_t unitbytes; /* Number of bytes in a unit */ 175eb02082bSJunchao Zhang PetscInt bs; /* Number of basic units in a unit */ 176cd620004SJunchao Zhang const void *rootdata, *leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 177cd620004SJunchao Zhang PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */ 178cd620004SJunchao Zhang 179cd620004SJunchao Zhang /* For local and remote communication */ 180cd620004SJunchao Zhang PetscMemType rootmtype_mpi, leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 181cd620004SJunchao Zhang PetscBool rootdirect[2], leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 182cd620004SJunchao Zhang PetscInt rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 183cd620004SJunchao Zhang const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 184cd620004SJunchao Zhang const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 18571438e86SJunchao Zhang char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */ 18671438e86SJunchao Zhang /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */ 187cd620004SJunchao Zhang char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 188cd620004SJunchao Zhang char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 189cd620004SJunchao Zhang char *leafbuf_alloc[2][2]; 190cd620004SJunchao Zhang MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 191cd620004SJunchao Zhang MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 192cd620004SJunchao Zhang PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 193cd620004SJunchao Zhang PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 194cd620004SJunchao Zhang MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 195cd620004SJunchao Zhang PetscSFLink next; 19671438e86SJunchao Zhang 19771438e86SJunchao Zhang PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */ 19871438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 19971438e86SJunchao Zhang cupmEvent_t dataReady; /* Events to mark readiness of root/leafdata */ 20071438e86SJunchao Zhang cupmEvent_t endRemoteComm; /* Events to mark end of local/remote communication */ 20171438e86SJunchao Zhang cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */ 20271438e86SJunchao Zhang 20371438e86SJunchao Zhang /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */ 20471438e86SJunchao Zhang uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */ 20571438e86SJunchao Zhang uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */ 20671438e86SJunchao Zhang #endif 20740e23c03SJunchao Zhang }; 20840e23c03SJunchao Zhang 209cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *); 210b7c0d12aSJunchao Zhang 211cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */ 212cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 213cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype); 214cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *); 215cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *); 21671438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink); 217cd620004SJunchao Zhang 218cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */ 219d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) 220d71ae5a4SJacob Faibussowitsch { 221eb02082bSJunchao Zhang PetscFunctionBegin; 22271438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack; 2237fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 224cd620004SJunchao Zhang else *Pack = link->d_Pack; 225eb02082bSJunchao Zhang #endif 2263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 227eb02082bSJunchao Zhang } 2287fd2d3dbSJunchao Zhang 229fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)); 230fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)); 231fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)); 232fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)); 233cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF, PetscSFLink, PetscSFDirection, void **, void **, MPI_Request **, MPI_Request **); 234b7c0d12aSJunchao Zhang 235cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */ 236cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *); 237cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *); 238cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op); 239cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op); 24071438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op); 241cd620004SJunchao Zhang 24271438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op); 243cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op); 244cd620004SJunchao Zhang 2457fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 2467fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 24771438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 2487fd2d3dbSJunchao Zhang 24920c24465SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 25071438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype); 25120c24465SJunchao Zhang #endif 25220c24465SJunchao Zhang 25359af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP) 25471438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype); 25559af0bd3SScott Kruger #endif 25659af0bd3SScott Kruger 25720c24465SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS) 25820c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype); 25920c24465SJunchao Zhang #endif 26020c24465SJunchao Zhang 26171438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 26271438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 26371438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *); 26471438e86SJunchao Zhang #endif 26571438e86SJunchao Zhang 266d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 267d71ae5a4SJacob Faibussowitsch { 26871438e86SJunchao Zhang PetscFunctionBegin; 2699566063dSJacob Faibussowitsch if (link->StartCommunication) PetscCall((*link->StartCommunication)(sf, link, direction)); 2703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 27171438e86SJunchao Zhang } 27271438e86SJunchao Zhang 273d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 274d71ae5a4SJacob Faibussowitsch { 27571438e86SJunchao Zhang PetscFunctionBegin; 2769566063dSJacob Faibussowitsch if (link->FinishCommunication) PetscCall((*link->FinishCommunication)(sf, link, direction)); 2773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 27871438e86SJunchao Zhang } 27971438e86SJunchao Zhang 2807fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 2814d9d436bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL) 2827fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 2837fd2d3dbSJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 2847fd2d3dbSJunchao Zhang */ 285d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host) 286d71ae5a4SJacob Faibussowitsch { 2877fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 2887fd2d3dbSJunchao Zhang 2897fd2d3dbSJunchao Zhang PetscFunctionBegin; 29071438e86SJunchao Zhang /* rootdata is on device but we use regular MPI for communication */ 29171438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) { 2927fd2d3dbSJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 2937fd2d3dbSJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 2947fd2d3dbSJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes; 2957fd2d3dbSJunchao Zhang if (device2host) { 2969566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count)); 2979566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(count)); 2987fd2d3dbSJunchao Zhang } else { 2999566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count)); 3009566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(count)); 3017fd2d3dbSJunchao Zhang } 3027fd2d3dbSJunchao Zhang } 3033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3047fd2d3dbSJunchao Zhang } 3057fd2d3dbSJunchao Zhang 306d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host) 307d71ae5a4SJacob Faibussowitsch { 3087fd2d3dbSJunchao Zhang PetscFunctionBegin; 30971438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) { 3107fd2d3dbSJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 3117fd2d3dbSJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 3127fd2d3dbSJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes; 3137fd2d3dbSJunchao Zhang if (device2host) { 3149566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count)); 3159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(count)); 3167fd2d3dbSJunchao Zhang } else { 3179566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count)); 3189566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(count)); 3197fd2d3dbSJunchao Zhang } 3207fd2d3dbSJunchao Zhang } 3213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3227fd2d3dbSJunchao Zhang } 3237fd2d3dbSJunchao Zhang 32471438e86SJunchao Zhang /* Make sure root/leafbuf for the remote is ready for MPI */ 325d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 326d71ae5a4SJacob Faibussowitsch { 32771438e86SJunchao Zhang PetscSF_Basic *bas; 32871438e86SJunchao Zhang PetscInt buflen; 32971438e86SJunchao Zhang PetscMemType mtype; 33071438e86SJunchao Zhang 33171438e86SJunchao Zhang PetscFunctionBegin; 33271438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 33371438e86SJunchao Zhang bas = (PetscSF_Basic *)sf->data; 33471438e86SJunchao Zhang mtype = link->rootmtype; 33571438e86SJunchao Zhang buflen = bas->rootbuflen[PETSCSF_REMOTE]; 33671438e86SJunchao Zhang } else { 33771438e86SJunchao Zhang mtype = link->leafmtype; 33871438e86SJunchao Zhang buflen = sf->leafbuflen[PETSCSF_REMOTE]; 33971438e86SJunchao Zhang } 34071438e86SJunchao Zhang 34148a46eb9SPierre Jolivet if (PetscMemTypeDevice(mtype) && buflen) PetscCall((*link->SyncStream)(link)); 3423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 34371438e86SJunchao Zhang } 3447fd2d3dbSJunchao Zhang #else /* Host only */ 3453ba16761SJacob Faibussowitsch #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS 3463ba16761SJacob Faibussowitsch #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS 3473ba16761SJacob Faibussowitsch #define PetscSFLinkSyncStreamBeforeCallMPI(a, b, c) PETSC_SUCCESS 3487fd2d3dbSJunchao Zhang #endif 349cd620004SJunchao Zhang 350cd620004SJunchao Zhang /* Get root indices used for pack/unpack 351cd620004SJunchao Zhang 352cd620004SJunchao Zhang Input arguments: 353cd620004SJunchao Zhang +sf - StarForest 354cd620004SJunchao Zhang .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 355cd620004SJunchao Zhang .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 35671438e86SJunchao Zhang -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 357cd620004SJunchao Zhang 358cd620004SJunchao Zhang Output arguments: 359cd620004SJunchao Zhang +count - Count of indices 360cd620004SJunchao Zhang .start - The first index (only useful when indices is NULL) 36171438e86SJunchao Zhang .opt - Packing optimizations 36271438e86SJunchao Zhang -indices - Indices of roots for pack/unpack. NULL means indices are contiguous 363cd620004SJunchao Zhang */ 364d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices) 365d71ae5a4SJacob Faibussowitsch { 366cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 367cd620004SJunchao Zhang PetscInt offset; 368b7c0d12aSJunchao Zhang 369b7c0d12aSJunchao Zhang PetscFunctionBegin; 370fcc7397dSJunchao Zhang *count = bas->rootbuflen[scope]; 371fcc7397dSJunchao Zhang *start = bas->rootstart[scope]; 372fcc7397dSJunchao Zhang *opt = NULL; 373fcc7397dSJunchao Zhang *indices = NULL; 374fcc7397dSJunchao Zhang 375fcc7397dSJunchao Zhang /* We have these rules: 376fcc7397dSJunchao Zhang 1) opt == NULL && indices == NULL ==> indices are contiguous. 377fcc7397dSJunchao Zhang 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 378fcc7397dSJunchao Zhang want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 379fcc7397dSJunchao Zhang */ 380fcc7397dSJunchao Zhang if (!bas->rootcontig[scope]) { 381cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks]; 3829371c9d4SSatish Balay if (PetscMemTypeHost(mtype)) { 3839371c9d4SSatish Balay *opt = bas->rootpackopt[scope]; 3849371c9d4SSatish Balay *indices = bas->irootloc + offset; 3859371c9d4SSatish Balay } else { 386fcc7397dSJunchao Zhang size_t size; 387fcc7397dSJunchao Zhang if (bas->rootpackopt[scope]) { 388fcc7397dSJunchao Zhang if (!bas->rootpackopt_d[scope]) { 3899566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1, &bas->rootpackopt_d[scope])); 3909566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1)); /* Make pointers in bas->rootpackopt_d[] still work on host */ 391fcc7397dSJunchao Zhang size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 3929566063dSJacob Faibussowitsch PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array)); 3939566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size)); 394fcc7397dSJunchao Zhang } 395fcc7397dSJunchao Zhang *opt = bas->rootpackopt_d[scope]; 396fcc7397dSJunchao Zhang } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 397fcc7397dSJunchao Zhang if (!bas->irootloc_d[scope]) { 398fcc7397dSJunchao Zhang size = bas->rootbuflen[scope] * sizeof(PetscInt); 3999566063dSJacob Faibussowitsch PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope])); 4009566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size)); 401b7c0d12aSJunchao Zhang } 402cd620004SJunchao Zhang *indices = bas->irootloc_d[scope]; 403cd620004SJunchao Zhang } 404cd620004SJunchao Zhang } 405cd620004SJunchao Zhang } 4063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 407b7c0d12aSJunchao Zhang } 408b7c0d12aSJunchao Zhang 409cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack 410cd620004SJunchao Zhang 411fcc7397dSJunchao Zhang See also PetscSFLinkGetRootPackOptAndIndices() 412cd620004SJunchao Zhang */ 413d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices) 414d71ae5a4SJacob Faibussowitsch { 415cd620004SJunchao Zhang PetscInt offset; 416cd620004SJunchao Zhang 417cd620004SJunchao Zhang PetscFunctionBegin; 418fcc7397dSJunchao Zhang *count = sf->leafbuflen[scope]; 419fcc7397dSJunchao Zhang *start = sf->leafstart[scope]; 420fcc7397dSJunchao Zhang *opt = NULL; 421fcc7397dSJunchao Zhang *indices = NULL; 422fcc7397dSJunchao Zhang if (!sf->leafcontig[scope]) { 423cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks]; 4249371c9d4SSatish Balay if (PetscMemTypeHost(mtype)) { 4259371c9d4SSatish Balay *opt = sf->leafpackopt[scope]; 4269371c9d4SSatish Balay *indices = sf->rmine + offset; 4279371c9d4SSatish Balay } else { 428fcc7397dSJunchao Zhang size_t size; 429fcc7397dSJunchao Zhang if (sf->leafpackopt[scope]) { 430fcc7397dSJunchao Zhang if (!sf->leafpackopt_d[scope]) { 4319566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1, &sf->leafpackopt_d[scope])); 4329566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1)); 433fcc7397dSJunchao Zhang size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 4349566063dSJacob Faibussowitsch PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array)); /* Change ->array to a device pointer */ 4359566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size)); 436fcc7397dSJunchao Zhang } 437fcc7397dSJunchao Zhang *opt = sf->leafpackopt_d[scope]; 438fcc7397dSJunchao Zhang } else { 439fcc7397dSJunchao Zhang if (!sf->rmine_d[scope]) { 440fcc7397dSJunchao Zhang size = sf->leafbuflen[scope] * sizeof(PetscInt); 4419566063dSJacob Faibussowitsch PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope])); 4429566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size)); 443cd620004SJunchao Zhang } 444cd620004SJunchao Zhang *indices = sf->rmine_d[scope]; 445cd620004SJunchao Zhang } 446cd620004SJunchao Zhang } 447cd620004SJunchao Zhang } 4483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 449cd620004SJunchao Zhang } 450