#include <petscsys.h> /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
{
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  PetscCall(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCall(PetscFree(p->globranks));
  PetscCall(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  /* Data structures to support freeing comms created in PetscShmCommGet().
  Since communicators passed to PetscShmCommGet() are very likely either a
  petsc inner communicator or an MPI communicator with a linked petsc inner
  communicator, we use a simple static array to store dupped communicators
  in the rare remaining cases.
 */
  #define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms = 0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscInt i;
  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
    PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Note:
    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

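    Example Usage:
    A minimal sketch; the variable name is illustrative. The returned object is stashed as an attribute on the (inner) communicator and destroyed when that communicator is freed, so the caller must not free it,
.vb
    PetscShmComm shmcomm;
    PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
.ve
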
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* the first time through, register a function to free the dupped petsc comms at PetscFinalize() */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If not, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

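    Example Usage:
    A minimal sketch that maps the calling process's own global rank (names are illustrative),
.vb
    PetscShmComm shmcomm;
    PetscMPIInt  grank, lrank;
    PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
    PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &grank));
    PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
.ve
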
.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
{
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);
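  /* binary search on the sorted globranks[] until the window is small, then finish with a linear scan */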
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator from which the shared memory communicator was built

    Level: developer

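    Example Usage:
    A minimal sketch that recovers the global rank of the first process on this shared memory node (names are illustrative),
.vb
    PetscShmComm shmcomm;
    PetscMPIInt  grank0;
    PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
    PetscCall(PetscShmCommLocalToGlobal(shmcomm, 0, &grank0));
.ve
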
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

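    Example Usage:
    A minimal sketch (names are illustrative); the returned communicator is owned by pshmcomm and must not be freed by the caller,
.vb
    PetscShmComm shmcomm;
    MPI_Comm     nodecomm;
    PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
    PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &nodecomm));
.ve
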
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
  #include <pthread.h>
  #include <hwloc.h>
  #include <omp.h>

  /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect, and MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
  #define USE_MMAP_ALLOCATE_SHARED_MEMORY

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
  #endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* true on rank 0 of omp_comm (the master) */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in the omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
  #else
  PetscMPIInt disp_unit;
  #endif

  PetscFunctionBegin;
  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call; using omp_comm would bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
  #else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
  #endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
  #else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
  #endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    Developer Note:
    Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

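    Example Usage:
    A minimal sketch of the create/use/destroy cycle, mirroring the pattern shown in `PetscOmpCtrlBarrier()` (names are illustrative),
.vb
    PetscOmpCtrl ctrl;
    PetscBool    is_master;
    PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* -1: let PETSc pick nthreads */
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
    if (is_master) {
      PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
      /* call a third party library that uses OpenMP */
      PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
    }
    PetscCall(PetscOmpCtrlBarrier(ctrl));
    PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve
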
.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
  #if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
  #endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* the hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
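  /* number of unsigned longs needed to hold the topology cpuset, i.e., ceil((last_set_bit + 1) / (8 * sizeof(unsigned long))) */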
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
  #if HWLOC_API_VERSION >= 0x00020000
      PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
  #else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
  #endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
  #if HWLOC_API_VERSION >= 0x00020000
        PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
  #else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
  #endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
  PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
  }
  PetscCall(PetscFree(ctrl));
  PetscFunctionReturn(PETSC_SUCCESS);
}

502a32e93adSJunchao Zhang /*@C
5038fcaa860SBarry Smith     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
504a32e93adSJunchao Zhang 
505a32e93adSJunchao Zhang     Input Parameter:
5068fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
507a32e93adSJunchao Zhang 
508d8d19677SJose E. Roman     Output Parameters:
509eff715bbSJunchao Zhang +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
510a32e93adSJunchao Zhang .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
511811af0c4SBarry Smith                        on slave ranks, `MPI_COMM_NULL` will be return in reality.
512a32e93adSJunchao Zhang -   is_omp_master    - true if the calling process is an OMP master rank.
513a32e93adSJunchao Zhang 
514811af0c4SBarry Smith     Note:
515811af0c4SBarry Smith     Any output parameter can be NULL. The parameter is just ignored.
516eff715bbSJunchao Zhang 
517a32e93adSJunchao Zhang     Level: developer
518811af0c4SBarry Smith 
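    Example Usage:
    A minimal sketch, assuming ctrl was obtained from `PetscOmpCtrlCreate()` (names are illustrative),
.vb
    MPI_Comm  master_comm;
    PetscBool is_master;
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, &master_comm, &is_master));
    if (is_master) { /* pass master_comm to the third party library */ }
.ve
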
.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct, but the MPI standard does not
    require that processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier does have this property.

    A code using `PetscOmpCtrlBarrier()` would look like this,
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Note:
    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Note:
   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}

  #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */