#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator attribute when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() invokes it on the communicator that carries this data as an attribute.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share common memory

    Collective on globcomm.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

    Concepts: MPI subcomm^numbering

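    Example Usage:
    A minimal sketch of retrieving and using the shared memory communicator. The duplicated communicator pcomm and
    the queried global rank 0 are illustrative only; pshmcomm is stored as an attribute of pcomm and must not be
    freed by the caller. Error handling follows the usual ierr/CHKERRQ pattern used in this file.
.vb
    PetscShmComm pshmcomm;
    MPI_Comm     pcomm,shmcomm;
    PetscMPIInt  lrank;

    ierr = PetscCommDuplicate(PETSC_COMM_WORLD,&pcomm,NULL);CHKERRQ(ierr);
    ierr = PetscShmCommGet(pcomm,&pshmcomm);CHKERRQ(ierr);
    ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
    ierr = PetscShmCommGlobalToLocal(pshmcomm,0,&lrank);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&pcomm);CHKERRQ(ierr);
.ve
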
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if grank is not in the shared memory communicator

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in a single call

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
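  /* binary search on the sorted globranks[] until at most five candidates remain, then finish with a linear scan */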
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator from which the shared memory communicator was built

    Level: developer

    Concepts: MPI subcomm^numbering
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator used to spawn OMP threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, which plays the role of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true if this rank is rank 0 (the master) of its omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu binding of the omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of the cpu bindings of all ranks in omp_comm */
};

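/* Illustrative layout (not enforced here, just an example of the split done in PetscOmpCtrlCreate() below):
   with 8 MPI ranks on one node and nthreads=4, ranks 0-3 form one omp_comm and ranks 4-7 another; ranks 0 and 4
   are the masters and together make up omp_master_comm, which is what gets handed to a third party library. */
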
/* Allocate a shared pthread_barrier_t object in ctrl->omp_comm and set ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  PetscMPIInt           disp_unit;
  void                  *baseptr;
  pthread_barrierattr_t attr;

  PetscFunctionBegin;
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
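  /* only the master contributes memory to the shared window; every rank then queries rank 0's segment so that all
     ranks in omp_comm end up pointing at the same barrier object */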
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy ctrl->barrier */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure the slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Create a PETSc OpenMP controller, which manages PETSc's interaction with the OpenMP runtime */
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. The OMP masters make up a new comm called
    omp_master_comm, which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering does not change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put rank 0's of the omp_comms (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed within PetscOmpCtrlCreate()\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they would add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code.
     nr_cpu_ulongs is the number of unsigned longs needed to hold the node-wide cpuset, i.e., ceil((last cpu index + 1)/(bits per unsigned long)) */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* bitwise-OR everybody's binding onto the master (rank 0 of omp_comm) */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed within PetscOmpCtrlCreate()\n");
    if (nr_cpu_ulongs == 1) {
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
    } else {
      for (i=0; i<nr_cpu_ulongs; i++)  { ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr); }
    }
  }

  /* all wait for the master to finish the initialization before using the barrier */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode  ierr;
  PetscOmpCtrl    ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PetscOmpCtrl

    Input Parameter:
.   ctrl - a PetscOmpCtrl

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and its slave ranks
.   omp_master_comm  - on master ranks, a communicator made up of the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

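    Example Usage:
    A minimal sketch of letting only the master ranks call into an OpenMP-enabled third party library; on slave ranks
    omp_master_comm is MPI_COMM_NULL, so they skip the call. third_party_init() is a hypothetical stand-in for such a
    library call.
.vb
    MPI_Comm  omp_comm,omp_master_comm;
    PetscBool is_master;

    ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_master);CHKERRQ(ierr);
    if (is_master) third_party_init(omp_master_comm);
.ve
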
    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

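/*
   Typical use of the OpenMP controller (a sketch; petsc_comm and nthreads stand for the caller's communicator and
   thread count): the master rank widens its CPU binding and calls an OpenMP-enabled library while its slave ranks
   sleep on the shared pthread barrier until the master reaches PetscOmpCtrlBarrier() as well.

     PetscOmpCtrl ctrl;
     PetscBool    is_master;

     ierr = PetscOmpCtrlCreate(petsc_comm,nthreads,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,NULL,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       ... call the third party library that uses OpenMP ...
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/
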
/* A barrier in the scope of an omp_comm. MPI_Barrier is not used because it keeps polling and does not free the CPUs that OpenMP wants to use */
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/* call this on master ranks before calling a library using OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override OMP_NUM_THREADS set in the environment */
  PetscFunctionReturn(0);
}

/* call this on master ranks after leaving a library using OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#endif /* defined(PETSC_HAVE_PTHREAD) && .. */