#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;

/*===================================================================================*/
/*              Internal utility routines                                            */
/*===================================================================================*/

static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology. It is not cheap to build, so we create it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Whose roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}

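/*
  For reference: the neighborhood collectives used below deliver one message per graph
  neighbor, and (per the MPI standard) a rank's neighbors are ordered exactly as the
  sources/destinations arrays passed to MPI_Dist_graph_create_adjacent() above, which is
  why the counts/displs built in PetscSFSetUp_Neighbor() can be indexed in that same
  order. A minimal standalone sketch, assuming a hypothetical ring topology and buffers
  (not part of this file):

    MPI_Comm nbrcomm;
    int      src = (rank + size - 1) % size, dst = (rank + 1) % size;
    MPI_Dist_graph_create_adjacent(MPI_COMM_WORLD, 1, &src, MPI_UNWEIGHTED, 1, &dst, MPI_UNWEIGHTED, MPI_INFO_NULL, 0, &nbrcomm);
    // sendcounts[]/sdispls[] follow the destinations order; recvcounts[]/rdispls[] the sources order
    MPI_Neighbor_alltoallv(sendbuf, sendcounts, sdispls, MPI_DOUBLE, recvbuf, recvcounts, rdispls, MPI_DOUBLE, nbrcomm);
    MPI_Comm_free(&nbrcomm);
*/
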
// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // OpenMPI-3.0 ran into errors with rootdegree = leafdegree = 0, so we skip the call in this case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat            = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm       = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

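/*
  Persistent neighborhood collectives follow the usual persistent-request pattern: the
  *_init call above binds buffers, counts, and displacements once, and each subsequent
  communication round then only pays for MPI_Start() plus a wait. A generic sketch,
  assuming hypothetical buffers and a dist-graph communicator nbrcomm (not part of this
  file):

    MPI_Request r;
    MPI_Neighbor_alltoallv_init(sbuf, scnts, sdispls, type, rbuf, rcnts, rdispls, type, nbrcomm, MPI_INFO_NULL, &r);
    for (int iter = 0; iter < niter; iter++) {
      MPI_Start(&r);
      MPI_Wait(&r, MPI_STATUS_IGNORE);
    }
    MPI_Request_free(&r);
*/
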
// Start MPI requests. If using non-GPU-aware MPI, we might need to copy data from device buffers to host buffers
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              Implementations of SF public APIs                                    */
/*===================================================================================*/

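/*
  PetscSFSetUp_Neighbor() below converts the offset arrays inherited from SFBasic into
  the counts/displs that neighborhood collectives expect: distinguished ranks (served
  through shared memory) are skipped, and displacements are rebased to the start of the
  remote region. A worked example with hypothetical numbers, ndrootranks = 1,
  nrootranks = 3, and rootoffset = {0, 4, 9, 11}:

    rootcounts = {9 - 4, 11 - 9} = {5, 2}
    rootdispls = {4 - 4,  9 - 4} = {0, 5}
*/
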
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs   = 0;
  dat->nrootreqs  = 1; // collectives only need one MPI_Request; we just put it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an OpenMPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

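/*
  A usage sketch (not part of this file): an application would typically reach this
  implementation through the standard PetscSF public API by setting the type, e.g.

    PetscSF sf;
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
    PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR));
    // ... PetscSFSetGraph(sf, ...), then PetscSFBcastBegin/End(), PetscSFReduceBegin/End() ...
    PetscCall(PetscSFDestroy(&sf));

  or at run time with -sf_type neighbor (optionally -sf_neighbor_persistent, registered below).
*/
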
PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}