#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* Convenience local types */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;

/*===================================================================================*/
/* Internal utility routines */
/*===================================================================================*/

static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology, which is not cheap to build so we do it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Which ranks' roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}
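/* Note on buffer ordering: for a communicator built with MPI_Dist_graph_create_adjacent(), the MPI
   standard specifies that neighborhood collectives order receive segments by the sources[] list and
   send segments by the destinations[] list passed at creation. The counts/displs arrays built in
   PetscSFSetUp_Neighbor() enumerate the non-distinguished ranks in the same order, which is what keeps
   the two consistent. */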
// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // Open MPI 3.0 errored when rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat            = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm       = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
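/* With MPI-4 persistent neighborhood collectives, MPIU_Neighbor_alltoallv_init() above binds the
   buffers, counts, displacements and datatype to the request once; every later round of communication
   merely restarts that request with MPI_Start() and completes it with a wait. That is why the init
   routine uses the fixed link buffers (link->rootbuf/leafbuf[PETSCSF_REMOTE][...]) rather than whatever
   buffers happen to be current when communication starts. */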
// Start MPI requests. If using non-GPU-aware MPI, we may need to copy data from the device buffer to the host buffer first
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/* Implementations of SF public APIs */
/*===================================================================================*/
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs   = 0;
  dat->nrootreqs  = 1; // collectives only need one MPI_Request; we just put it in rootreqs[]
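  /* For the j-th non-distinguished root/leaf neighbor, counts[j] is the number of units exchanged with
     that rank, displs[j] is the offset of its segment within the remote portion of the packed buffer
     (measured from rootoffset[ndrootranks] resp. leafoffset[ndleafranks]), and weights[j] is only an
     edge-weight hint passed to MPI_Dist_graph_create_adjacent(). */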
  /* Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory */
#if !PetscDefined(HAVE_OPENMPI) || (PetscDefined(HAVE_OMPI_MAJOR_VERSION) && PetscDefined(HAVE_OMPI_MINOR_VERSION) && PetscDefined(HAVE_OMPI_RELEASE_VERSION) && !(PETSC_HAVE_OMPI_MAJOR_VERSION == 5 && PETSC_HAVE_OMPI_MINOR_VERSION == 0 && PETSC_HAVE_OMPI_RELEASE_VERSION == 0))
  PetscCall(PetscMalloc6(m, &dat->rootdispls, m, &dat->rootcounts, m, &dat->rootweights, n, &dat->leafdispls, n, &dat->leafcounts, n, &dat->leafweights));
#else // workaround for an OpenMPI 5.0.0 bug, https://github.com/open-mpi/ompi/issues/12037
  PetscMPIInt m2 = m ? m : 1, n2 = n ? n : 1;
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));
#endif

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
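/* The constructor reuses the Basic implementation for the Bcast/Reduce/FetchAndOp entry points and for
   embedded-SF creation; only setup, reset, destroy and the communication ops are overridden, so that
   inter-process traffic goes through MPI neighborhood collectives instead of the point-to-point
   sends/receives used by PETSCSFBASIC. */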
PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
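/* Usage sketch (illustrative, not part of this implementation): this SF type is normally selected at
   runtime with
     -sf_type neighbor [-sf_neighbor_persistent]
   or programmatically, e.g.

     PetscSF sf;
     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
     PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR));
     PetscCall(PetscSFSetFromOptions(sf));
     // ... PetscSFSetGraph(), then PetscSFBcastBegin/End() or PetscSFReduceBegin/End() as usual

   The SF graph and the public API are unchanged; only the transport differs. */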