xref: /petsc/src/sys/utils/openmp/mpmpishm.c (revision ce78bad369055609e946c9d2c25ea67a45873e27)
1*ce78bad3SBarry Smith #include <petscsys.h> /*I  "petscsys.h"  I*/
2*ce78bad3SBarry Smith #include <petsc/private/petscimpl.h>
3*ce78bad3SBarry Smith #include <pthread.h>
4*ce78bad3SBarry Smith #include <hwloc.h>
5*ce78bad3SBarry Smith #include <omp.h>
6*ce78bad3SBarry Smith 
7*ce78bad3SBarry Smith /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
8*ce78bad3SBarry Smith    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
9*ce78bad3SBarry Smith    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
10*ce78bad3SBarry Smith    by 50%. Until the reason is found out, we use mmap() instead.
11*ce78bad3SBarry Smith */
12*ce78bad3SBarry Smith #define USE_MMAP_ALLOCATE_SHARED_MEMORY
13*ce78bad3SBarry Smith 
14*ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
15*ce78bad3SBarry Smith   #include <sys/mman.h>
16*ce78bad3SBarry Smith   #include <sys/types.h>
17*ce78bad3SBarry Smith   #include <sys/stat.h>
18*ce78bad3SBarry Smith   #include <fcntl.h>
19*ce78bad3SBarry Smith #endif
20*ce78bad3SBarry Smith 
21*ce78bad3SBarry Smith struct _n_PetscOmpCtrl {
22*ce78bad3SBarry Smith   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
23*ce78bad3SBarry Smith   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
24*ce78bad3SBarry Smith   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
25*ce78bad3SBarry Smith   PetscBool          is_omp_master;   /* rank 0's in omp_comm */
26*ce78bad3SBarry Smith   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
27*ce78bad3SBarry Smith   pthread_barrier_t *barrier;         /* pointer to the barrier */
28*ce78bad3SBarry Smith   hwloc_topology_t   topology;
29*ce78bad3SBarry Smith   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
30*ce78bad3SBarry Smith   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
31*ce78bad3SBarry Smith };
32*ce78bad3SBarry Smith 
33*ce78bad3SBarry Smith /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
34*ce78bad3SBarry Smith    contained by the controller.
35*ce78bad3SBarry Smith 
36*ce78bad3SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
37*ce78bad3SBarry Smith    only because we want to separate shared memory allocation methods from other code.
38*ce78bad3SBarry Smith  */
39*ce78bad3SBarry Smith static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
40*ce78bad3SBarry Smith {
41*ce78bad3SBarry Smith   MPI_Aint              size;
42*ce78bad3SBarry Smith   void                 *baseptr;
43*ce78bad3SBarry Smith   pthread_barrierattr_t attr;
44*ce78bad3SBarry Smith 
45*ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
46*ce78bad3SBarry Smith   int  fd;
47*ce78bad3SBarry Smith   char pathname[PETSC_MAX_PATH_LEN];
48*ce78bad3SBarry Smith #else
49*ce78bad3SBarry Smith   PetscMPIInt disp_unit;
50*ce78bad3SBarry Smith #endif
51*ce78bad3SBarry Smith 
52*ce78bad3SBarry Smith   PetscFunctionBegin;
53*ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
54*ce78bad3SBarry Smith   size = sizeof(pthread_barrier_t);
55*ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
56*ce78bad3SBarry Smith     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
57*ce78bad3SBarry Smith     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
58*ce78bad3SBarry Smith     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
59*ce78bad3SBarry Smith     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
60*ce78bad3SBarry Smith     fd = mkstemp(pathname);
61*ce78bad3SBarry Smith     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
62*ce78bad3SBarry Smith     PetscCallExternal(ftruncate, fd, size);
63*ce78bad3SBarry Smith     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
64*ce78bad3SBarry Smith     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
65*ce78bad3SBarry Smith     PetscCallExternal(close, fd);
66*ce78bad3SBarry Smith     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
67*ce78bad3SBarry Smith     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
68*ce78bad3SBarry Smith     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
69*ce78bad3SBarry Smith     PetscCallExternal(unlink, pathname);
70*ce78bad3SBarry Smith   } else {
71*ce78bad3SBarry Smith     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
72*ce78bad3SBarry Smith     fd = open(pathname, O_RDWR);
73*ce78bad3SBarry Smith     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
74*ce78bad3SBarry Smith     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
75*ce78bad3SBarry Smith     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
76*ce78bad3SBarry Smith     PetscCallExternal(close, fd);
77*ce78bad3SBarry Smith     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
78*ce78bad3SBarry Smith   }
79*ce78bad3SBarry Smith #else
80*ce78bad3SBarry Smith   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
81*ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
82*ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
83*ce78bad3SBarry Smith #endif
84*ce78bad3SBarry Smith   ctrl->barrier = (pthread_barrier_t *)baseptr;
85*ce78bad3SBarry Smith 
86*ce78bad3SBarry Smith   /* omp master initializes the barrier */
87*ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
88*ce78bad3SBarry Smith     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
89*ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_init, &attr);
90*ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
91*ce78bad3SBarry Smith     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
92*ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_destroy, &attr);
93*ce78bad3SBarry Smith   }
94*ce78bad3SBarry Smith 
95*ce78bad3SBarry Smith   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
96*ce78bad3SBarry Smith   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
97*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
98*ce78bad3SBarry Smith }
99*ce78bad3SBarry Smith 
100*ce78bad3SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
101*ce78bad3SBarry Smith static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
102*ce78bad3SBarry Smith {
103*ce78bad3SBarry Smith   PetscFunctionBegin;
104*ce78bad3SBarry Smith   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
105*ce78bad3SBarry Smith   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
106*ce78bad3SBarry Smith   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
107*ce78bad3SBarry Smith 
108*ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
109*ce78bad3SBarry Smith   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
110*ce78bad3SBarry Smith #else
111*ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
112*ce78bad3SBarry Smith #endif
113*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
114*ce78bad3SBarry Smith }
115*ce78bad3SBarry Smith 
116*ce78bad3SBarry Smith /*@C
117*ce78bad3SBarry Smith   PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
118*ce78bad3SBarry Smith 
119*ce78bad3SBarry Smith   Input Parameters:
120*ce78bad3SBarry Smith + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
121*ce78bad3SBarry Smith - nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
122*ce78bad3SBarry Smith 
123*ce78bad3SBarry Smith   Output Parameter:
124*ce78bad3SBarry Smith . pctrl - a PETSc OpenMP controller
125*ce78bad3SBarry Smith 
126*ce78bad3SBarry Smith   Level: developer
127*ce78bad3SBarry Smith 
128*ce78bad3SBarry Smith   Developer Note:
129*ce78bad3SBarry Smith   Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
130*ce78bad3SBarry Smith 
131*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
132*ce78bad3SBarry Smith @*/
133*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
134*ce78bad3SBarry Smith {
135*ce78bad3SBarry Smith   PetscOmpCtrl   ctrl;
136*ce78bad3SBarry Smith   unsigned long *cpu_ulongs = NULL;
137*ce78bad3SBarry Smith   PetscInt       i, nr_cpu_ulongs;
138*ce78bad3SBarry Smith   PetscShmComm   pshmcomm;
139*ce78bad3SBarry Smith   MPI_Comm       shm_comm;
140*ce78bad3SBarry Smith   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
141*ce78bad3SBarry Smith   PetscInt       num_packages, num_cores;
142*ce78bad3SBarry Smith 
143*ce78bad3SBarry Smith   PetscFunctionBegin;
144*ce78bad3SBarry Smith   PetscCall(PetscNew(&ctrl));
145*ce78bad3SBarry Smith 
146*ce78bad3SBarry Smith   /*=================================================================================
147*ce78bad3SBarry Smith     Init hwloc
148*ce78bad3SBarry Smith    ==================================================================================*/
149*ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
150*ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
151*ce78bad3SBarry Smith   /* to filter out unneeded info and have faster hwloc_topology_load */
152*ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
153*ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
154*ce78bad3SBarry Smith #endif
155*ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_load, ctrl->topology);
156*ce78bad3SBarry Smith 
157*ce78bad3SBarry Smith   /*=================================================================================
158*ce78bad3SBarry Smith     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
159*ce78bad3SBarry Smith     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
160*ce78bad3SBarry Smith     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
161*ce78bad3SBarry Smith     which is usually passed to third party libraries.
162*ce78bad3SBarry Smith    ==================================================================================*/
163*ce78bad3SBarry Smith 
164*ce78bad3SBarry Smith   /* fetch the stored shared memory communicator */
165*ce78bad3SBarry Smith   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
166*ce78bad3SBarry Smith   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
167*ce78bad3SBarry Smith 
168*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
169*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
170*ce78bad3SBarry Smith 
171*ce78bad3SBarry Smith   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
172*ce78bad3SBarry Smith   if (nthreads == -1) {
173*ce78bad3SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
174*ce78bad3SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
175*ce78bad3SBarry Smith     nthreads     = num_cores / num_packages;
176*ce78bad3SBarry Smith     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
177*ce78bad3SBarry Smith   }
178*ce78bad3SBarry Smith 
179*ce78bad3SBarry Smith   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
180*ce78bad3SBarry Smith   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
181*ce78bad3SBarry Smith 
182*ce78bad3SBarry Smith   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
183*ce78bad3SBarry Smith      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
184*ce78bad3SBarry Smith      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
185*ce78bad3SBarry Smith      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
186*ce78bad3SBarry Smith      Use 0 as key so that rank ordering wont change in new comm.
187*ce78bad3SBarry Smith    */
188*ce78bad3SBarry Smith   color = shm_rank / nthreads;
189*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
190*ce78bad3SBarry Smith 
191*ce78bad3SBarry Smith   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
192*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
193*ce78bad3SBarry Smith   if (!omp_rank) {
194*ce78bad3SBarry Smith     ctrl->is_omp_master = PETSC_TRUE; /* master */
195*ce78bad3SBarry Smith     color               = 0;
196*ce78bad3SBarry Smith   } else {
197*ce78bad3SBarry Smith     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
198*ce78bad3SBarry Smith     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
199*ce78bad3SBarry Smith   }
200*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
201*ce78bad3SBarry Smith 
202*ce78bad3SBarry Smith   /*=================================================================================
203*ce78bad3SBarry Smith     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
204*ce78bad3SBarry Smith     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
205*ce78bad3SBarry Smith     and run them on the idle CPUs.
206*ce78bad3SBarry Smith    ==================================================================================*/
207*ce78bad3SBarry Smith   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
208*ce78bad3SBarry Smith 
209*ce78bad3SBarry Smith   /*=================================================================================
210*ce78bad3SBarry Smith     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
211*ce78bad3SBarry Smith     is the union of the bindings of all ranks in the omp_comm
212*ce78bad3SBarry Smith     =================================================================================*/
213*ce78bad3SBarry Smith 
214*ce78bad3SBarry Smith   ctrl->cpuset = hwloc_bitmap_alloc();
215*ce78bad3SBarry Smith   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
216*ce78bad3SBarry Smith   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
217*ce78bad3SBarry Smith 
218*ce78bad3SBarry Smith   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
219*ce78bad3SBarry Smith   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
220*ce78bad3SBarry Smith   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
221*ce78bad3SBarry Smith   if (nr_cpu_ulongs == 1) {
222*ce78bad3SBarry Smith     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
223*ce78bad3SBarry Smith   } else {
224*ce78bad3SBarry Smith     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
225*ce78bad3SBarry Smith   }
226*ce78bad3SBarry Smith 
227*ce78bad3SBarry Smith   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
228*ce78bad3SBarry Smith 
229*ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
230*ce78bad3SBarry Smith     ctrl->omp_cpuset = hwloc_bitmap_alloc();
231*ce78bad3SBarry Smith     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
232*ce78bad3SBarry Smith     if (nr_cpu_ulongs == 1) {
233*ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
234*ce78bad3SBarry Smith       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
235*ce78bad3SBarry Smith #else
236*ce78bad3SBarry Smith       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
237*ce78bad3SBarry Smith #endif
238*ce78bad3SBarry Smith     } else {
239*ce78bad3SBarry Smith       for (i = 0; i < nr_cpu_ulongs; i++) {
240*ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
241*ce78bad3SBarry Smith         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
242*ce78bad3SBarry Smith #else
243*ce78bad3SBarry Smith         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
244*ce78bad3SBarry Smith #endif
245*ce78bad3SBarry Smith       }
246*ce78bad3SBarry Smith     }
247*ce78bad3SBarry Smith   }
248*ce78bad3SBarry Smith   PetscCall(PetscFree(cpu_ulongs));
249*ce78bad3SBarry Smith   *pctrl = ctrl;
250*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
251*ce78bad3SBarry Smith }
252*ce78bad3SBarry Smith 
253*ce78bad3SBarry Smith /*@C
254*ce78bad3SBarry Smith   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
255*ce78bad3SBarry Smith 
256*ce78bad3SBarry Smith   Input Parameter:
257*ce78bad3SBarry Smith . pctrl - a PETSc OpenMP controller
258*ce78bad3SBarry Smith 
259*ce78bad3SBarry Smith   Level: developer
260*ce78bad3SBarry Smith 
261*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
262*ce78bad3SBarry Smith @*/
263*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
264*ce78bad3SBarry Smith {
265*ce78bad3SBarry Smith   PetscOmpCtrl ctrl = *pctrl;
266*ce78bad3SBarry Smith 
267*ce78bad3SBarry Smith   PetscFunctionBegin;
268*ce78bad3SBarry Smith   hwloc_bitmap_free(ctrl->cpuset);
269*ce78bad3SBarry Smith   hwloc_topology_destroy(ctrl->topology);
270*ce78bad3SBarry Smith   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
271*ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
272*ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
273*ce78bad3SBarry Smith     hwloc_bitmap_free(ctrl->omp_cpuset);
274*ce78bad3SBarry Smith     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
275*ce78bad3SBarry Smith   }
276*ce78bad3SBarry Smith   PetscCall(PetscFree(ctrl));
277*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
278*ce78bad3SBarry Smith }
279*ce78bad3SBarry Smith 
280*ce78bad3SBarry Smith /*@C
281*ce78bad3SBarry Smith   PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
282*ce78bad3SBarry Smith 
283*ce78bad3SBarry Smith   Input Parameter:
284*ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
285*ce78bad3SBarry Smith 
286*ce78bad3SBarry Smith   Output Parameters:
287*ce78bad3SBarry Smith + omp_comm        - a communicator that includes a master rank and slave ranks where master spawns threads
288*ce78bad3SBarry Smith . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
289*ce78bad3SBarry Smith                     on slave ranks, `MPI_COMM_NULL` will be return in reality.
290*ce78bad3SBarry Smith - is_omp_master   - true if the calling process is an OMP master rank.
291*ce78bad3SBarry Smith 
292*ce78bad3SBarry Smith   Note:
293*ce78bad3SBarry Smith   Any output parameter can be `NULL`. The parameter is just ignored.
294*ce78bad3SBarry Smith 
295*ce78bad3SBarry Smith   Level: developer
296*ce78bad3SBarry Smith 
297*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
298*ce78bad3SBarry Smith @*/
299*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
300*ce78bad3SBarry Smith {
301*ce78bad3SBarry Smith   PetscFunctionBegin;
302*ce78bad3SBarry Smith   if (omp_comm) *omp_comm = ctrl->omp_comm;
303*ce78bad3SBarry Smith   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
304*ce78bad3SBarry Smith   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
305*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
306*ce78bad3SBarry Smith }
307*ce78bad3SBarry Smith 
308*ce78bad3SBarry Smith /*@C
309*ce78bad3SBarry Smith   PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
310*ce78bad3SBarry Smith 
311*ce78bad3SBarry Smith   Input Parameter:
312*ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
313*ce78bad3SBarry Smith 
314*ce78bad3SBarry Smith   Notes:
315*ce78bad3SBarry Smith   This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
316*ce78bad3SBarry Smith   require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
317*ce78bad3SBarry Smith   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
318*ce78bad3SBarry Smith 
319*ce78bad3SBarry Smith   A code using `PetscOmpCtrlBarrier()` would be like this,
320*ce78bad3SBarry Smith .vb
321*ce78bad3SBarry Smith   if (is_omp_master) {
322*ce78bad3SBarry Smith     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
323*ce78bad3SBarry Smith     Call the library using OpenMP
324*ce78bad3SBarry Smith     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
325*ce78bad3SBarry Smith   }
326*ce78bad3SBarry Smith   PetscOmpCtrlBarrier(ctrl);
327*ce78bad3SBarry Smith .ve
328*ce78bad3SBarry Smith 
329*ce78bad3SBarry Smith   Level: developer
330*ce78bad3SBarry Smith 
331*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
332*ce78bad3SBarry Smith @*/
333*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
334*ce78bad3SBarry Smith {
335*ce78bad3SBarry Smith   int err;
336*ce78bad3SBarry Smith 
337*ce78bad3SBarry Smith   PetscFunctionBegin;
338*ce78bad3SBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
339*ce78bad3SBarry Smith   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
340*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
341*ce78bad3SBarry Smith }
342*ce78bad3SBarry Smith 
343*ce78bad3SBarry Smith /*@C
344*ce78bad3SBarry Smith   PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
345*ce78bad3SBarry Smith 
346*ce78bad3SBarry Smith   Input Parameter:
347*ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
348*ce78bad3SBarry Smith 
349*ce78bad3SBarry Smith   Note:
350*ce78bad3SBarry Smith   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
351*ce78bad3SBarry Smith   This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
352*ce78bad3SBarry Smith 
353*ce78bad3SBarry Smith   Level: developer
354*ce78bad3SBarry Smith 
355*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
356*ce78bad3SBarry Smith @*/
357*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
358*ce78bad3SBarry Smith {
359*ce78bad3SBarry Smith   PetscFunctionBegin;
360*ce78bad3SBarry Smith   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
361*ce78bad3SBarry Smith   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
362*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
363*ce78bad3SBarry Smith }
364*ce78bad3SBarry Smith 
365*ce78bad3SBarry Smith /*@C
366*ce78bad3SBarry Smith   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
367*ce78bad3SBarry Smith 
368*ce78bad3SBarry Smith   Input Parameter:
369*ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
370*ce78bad3SBarry Smith 
371*ce78bad3SBarry Smith   Note:
372*ce78bad3SBarry Smith   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
373*ce78bad3SBarry Smith   This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
374*ce78bad3SBarry Smith 
375*ce78bad3SBarry Smith   Level: developer
376*ce78bad3SBarry Smith 
377*ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
378*ce78bad3SBarry Smith @*/
379*ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
380*ce78bad3SBarry Smith {
381*ce78bad3SBarry Smith   PetscFunctionBegin;
382*ce78bad3SBarry Smith   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
383*ce78bad3SBarry Smith   omp_set_num_threads(1);
384*ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
385*ce78bad3SBarry Smith }
386*ce78bad3SBarry Smith 
387*ce78bad3SBarry Smith #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
388