#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types and wrappers */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
  #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv_c(a, b, c, d, e, f, g, h, i)
  #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init_c(a, b, c, d, e, f, g, h, i, j, k)
  #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv_c(a, b, c, d, e, f, g, h, i, j)
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
  #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)
  #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k)
  #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)
#endif

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;
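/*
   Illustration only (numbers are assumptions, not part of the implementation): suppose after
   setup a rank has three root neighbors, the first being the distinguished (self) rank, and the
   two remote neighbors access 5 and 7 of its root entries respectively. The fields above would
   then be sized for the two non-distinguished ranks only:

     rootdegree   = 2;
     rootcounts[] = {5, 7};  // entries exchanged with each remote neighbor
     rootdispls[] = {0, 5};  // offsets into the packed remote root buffer

   With PETSC_HAVE_MPI_LARGE_COUNT and 64-bit indices, counts/displs are MPI_Count/MPI_Aint so a
   single neighbor may exchange more than 2^31-1 entries; otherwise they are plain PetscMPIInt.
*/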
/*===================================================================================*/
/* Internal utility routines */
/*===================================================================================*/

static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology, which is not cheap to build, so we do it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscMPIInt        nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Whose roots will my leaves access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}

// start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // Open MPI 3.0 errored when rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
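/*
   A minimal sketch (assuming a dist-graph communicator built as in PetscSFGetDistComm_Neighbor
   above) of what one ROOT2LEAF MPIU_Ineighbor_alltoallv amounts to: each rank sends
   rootbuf[rootdispls[j] .. rootdispls[j]+rootcounts[j]) to its j-th outgoing neighbor and
   receives leafcounts[k] entries at leafbuf[leafdispls[k]] from its k-th incoming neighbor,
   all completed through the single request:

     MPI_Request req;
     MPI_Ineighbor_alltoallv(rootbuf, rootcounts, rootdispls, unit,
                             leafbuf, leafcounts, leafdispls, unit, distcomm, &req);
     // ... overlap packing/computation with communication ...
     MPI_Wait(&req, MPI_STATUS_IGNORE);
*/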
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat            = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm       = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI requests. If using non-GPU-aware MPI, we might need to copy data from device buffers to host buffers first
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
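/*
   A minimal sketch of the MPI-4 persistent pattern used above (assuming a dist-graph
   communicator and count/displ arrays as in this file): the collective is initialized once and
   then started cheaply on every repeated communication with the same buffers:

     MPI_Request req;
     MPI_Info    info;
     MPI_Info_create(&info);
     MPI_Neighbor_alltoallv_init(sendbuf, sendcounts, sdispls, unit,
                                 recvbuf, recvcounts, rdispls, unit, distcomm, info, &req);
     MPI_Info_free(&info);
     for (int iter = 0; iter < niter; iter++) {
       MPI_Start(&req);
       MPI_Wait(&req, MPI_STATUS_IGNORE);
     }
     MPI_Request_free(&req);
*/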
static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/* Implementations of SF public APIs */
/*===================================================================================*/
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscMPIInt       nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs  = 0;
  dat->nrootreqs = 1; // collectives need only one MPI_Request; we just put it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an Open MPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647); /* Scale to the range of PetscMPIInt */
  }

  for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647);
  }
#else
  for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
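/*
   Worked example of the loops above (illustrative numbers only): with rootoffset[] = {0, 4, 9, 15},
   nrootranks = 3, and ndrootranks = 1 (one distinguished rank occupying offsets [0, 4)), the two
   non-distinguished neighbors get

     rootdispls[] = {0, 5};  // rootoffset[i] - rootoffset[ndrootranks], i.e. relative to the remote-only buffer
     rootcounts[] = {5, 6};  // rootoffset[i + 1] - rootoffset[i]

   and rootweights[] holds the same counts scaled into PetscMPIInt range, for use as edge weights
   in MPI_Dist_graph_create_adjacent.
*/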
static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (int i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
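/*
   Usage sketch (not compiled here; nroots/nleaves/iremote and the data arrays are placeholders):
   selecting this implementation for a star forest. The type can also be chosen at runtime with
   -sf_type neighbor, optionally with -sf_neighbor_persistent when PETSc is built against an
   MPI-4 library:

     PetscSF sf;
     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
     PetscCall(PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
     PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR));
     PetscCall(PetscSFSetFromOptions(sf));
     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFDestroy(&sf));
*/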