xref: /petsc/src/sys/utils/mpishm.c (revision dc9a610e8a93b2ceab441eb761644f4cbb77655c)
15f7487a0SJunchao Zhang #include <petscsys.h> /*I  "petscsys.h"  I*/
25f7487a0SJunchao Zhang #include <petsc/private/petscimpl.h>
35f7487a0SJunchao Zhang 
45f7487a0SJunchao Zhang struct _n_PetscShmComm {
55f7487a0SJunchao Zhang   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
65f7487a0SJunchao Zhang   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
75f7487a0SJunchao Zhang   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
85f7487a0SJunchao Zhang };
95f7487a0SJunchao Zhang 
105f7487a0SJunchao Zhang /*
1133779a13SJunchao Zhang    Private routine to delete internal shared memory communicator when a communicator is freed.
125f7487a0SJunchao Zhang 
135f7487a0SJunchao Zhang    This is called by MPI, not by users; MPI_Comm_free() calls it when the communicator that has this data as an attribute is freed.
145f7487a0SJunchao Zhang 
155f7487a0SJunchao Zhang    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
165f7487a0SJunchao Zhang 
175f7487a0SJunchao Zhang */
188434afd1SBarry Smith PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19d71ae5a4SJacob Faibussowitsch {
205f7487a0SJunchao Zhang   PetscShmComm p = (PetscShmComm)val;
215f7487a0SJunchao Zhang 
225f7487a0SJunchao Zhang   PetscFunctionBegin;
239566063dSJacob Faibussowitsch   PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
249566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_free(&p->shmcomm));
259566063dSJacob Faibussowitsch   PetscCallMPI(PetscFree(p->globranks));
269566063dSJacob Faibussowitsch   PetscCallMPI(PetscFree(val));
275f7487a0SJunchao Zhang   PetscFunctionReturn(MPI_SUCCESS);
285f7487a0SJunchao Zhang }
295f7487a0SJunchao Zhang 
30b48189acSJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31b48189acSJunchao Zhang   /* Data structures to support freeing comms created in PetscShmCommGet().
32b48189acSJunchao Zhang   Since we expect communicators passed to PetscShmCommGet() to be either petsc
33b48189acSJunchao Zhang   inner communicators or MPI communicators with a linked petsc inner
34b48189acSJunchao Zhang   communicator, we use a simple static array to store the communicators
35b48189acSJunchao Zhang   dupped in the rare remaining cases.
36b48189acSJunchao Zhang  */
37b48189acSJunchao Zhang   #define MAX_SHMCOMM_DUPPED_COMMS 16
38b48189acSJunchao Zhang static PetscInt       num_dupped_comms = 0;
39b48189acSJunchao Zhang static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41d71ae5a4SJacob Faibussowitsch {
42b48189acSJunchao Zhang   PetscInt i;
434d86920dSPierre Jolivet 
44b48189acSJunchao Zhang   PetscFunctionBegin;
459566063dSJacob Faibussowitsch   for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
46b48189acSJunchao Zhang   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
48b48189acSJunchao Zhang }
49b48189acSJunchao Zhang #endif
50b48189acSJunchao Zhang 
515f7487a0SJunchao Zhang /*@C
52811af0c4SBarry Smith   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory
535f7487a0SJunchao Zhang 
54d083f849SBarry Smith   Collective.
555f7487a0SJunchao Zhang 
565f7487a0SJunchao Zhang   Input Parameter:
57811af0c4SBarry Smith . globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm
585f7487a0SJunchao Zhang 
595f7487a0SJunchao Zhang   Output Parameter:
605f7487a0SJunchao Zhang . pshmcomm - the PETSc shared memory communicator object
615f7487a0SJunchao Zhang 
625f7487a0SJunchao Zhang   Level: developer
635f7487a0SJunchao Zhang 
64811af0c4SBarry Smith   Note:
655f7487a0SJunchao Zhang   When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
665f7487a0SJunchao Zhang 
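  Example Usage:
  A minimal sketch; the returned object is cached as an attribute on the (inner) communicator and is freed automatically when that communicator is freed, so callers do not free it.
.vb
  PetscShmComm shmcomm;
  MPI_Comm     mpishm;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &mpishm)); /* MPI communicator of the ranks sharing memory with this process */
.ve
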
67811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
685f7487a0SJunchao Zhang @*/
69d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
70d71ae5a4SJacob Faibussowitsch {
715f7487a0SJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
725f7487a0SJunchao Zhang   MPI_Group         globgroup, shmgroup;
735f7487a0SJunchao Zhang   PetscMPIInt      *shmranks, i, flg;
745f7487a0SJunchao Zhang   PetscCommCounter *counter;
755f7487a0SJunchao Zhang 
765f7487a0SJunchao Zhang   PetscFunctionBegin;
774f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 2);
78b48189acSJunchao Zhang   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
799566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
80b48189acSJunchao Zhang   if (!flg) { /* globcomm is not a petsc comm */
819371c9d4SSatish Balay     union
829371c9d4SSatish Balay     {
839371c9d4SSatish Balay       MPI_Comm comm;
849371c9d4SSatish Balay       void    *ptr;
859371c9d4SSatish Balay     } ucomm;
86b48189acSJunchao Zhang     /* check if globcomm already has a linked petsc inner comm */
879566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
88b48189acSJunchao Zhang     if (!flg) {
89b48189acSJunchao Zhang       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
9008401ef6SPierre Jolivet       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
919566063dSJacob Faibussowitsch       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
92b48189acSJunchao Zhang       /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
939566063dSJacob Faibussowitsch       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
94b48189acSJunchao Zhang       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
95b48189acSJunchao Zhang       num_dupped_comms++;
96b48189acSJunchao Zhang     } else {
97b48189acSJunchao Zhang       /* otherwise, we pull out the inner comm and use it as globcomm */
98b48189acSJunchao Zhang       globcomm = ucomm.comm;
99b48189acSJunchao Zhang     }
100b48189acSJunchao Zhang   }
1015f7487a0SJunchao Zhang 
102b48189acSJunchao Zhang   /* Check if globcomm already has an attached pshmcomm. If no, create one */
1039566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
1043ba16761SJacob Faibussowitsch   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
1055f7487a0SJunchao Zhang 
1069566063dSJacob Faibussowitsch   PetscCall(PetscNew(pshmcomm));
1075f7487a0SJunchao Zhang   (*pshmcomm)->globcomm = globcomm;
1085f7487a0SJunchao Zhang 
1099566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));
1105f7487a0SJunchao Zhang 
1119566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
1129566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
1139566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
1149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
1159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
1165f7487a0SJunchao Zhang   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
1179566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
1189566063dSJacob Faibussowitsch   PetscCall(PetscFree(shmranks));
1199566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_free(&globgroup));
1209566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_free(&shmgroup));
1215f7487a0SJunchao Zhang 
12248a46eb9SPierre Jolivet   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
1239566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
1243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1255f7487a0SJunchao Zhang #else
1265f7487a0SJunchao Zhang   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
1275f7487a0SJunchao Zhang #endif
1285f7487a0SJunchao Zhang }
1295f7487a0SJunchao Zhang 
1305f7487a0SJunchao Zhang /*@C
1315f7487a0SJunchao Zhang   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
1325f7487a0SJunchao Zhang 
1335f7487a0SJunchao Zhang   Input Parameters:
1345f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object
1355f7487a0SJunchao Zhang - grank    - the global rank
1365f7487a0SJunchao Zhang 
1375f7487a0SJunchao Zhang   Output Parameter:
138811af0c4SBarry Smith . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
1395f7487a0SJunchao Zhang 
1405f7487a0SJunchao Zhang   Level: developer
1415f7487a0SJunchao Zhang 
1425f7487a0SJunchao Zhang   Developer Notes:
1435f7487a0SJunchao Zhang   Assumes the pshmcomm->globranks[] is sorted
1445f7487a0SJunchao Zhang 
1455f7487a0SJunchao Zhang   It may be better to rewrite this to map multiple global ranks to local in the same function call
1465f7487a0SJunchao Zhang 
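  Example Usage:
  A minimal sketch; grank is assumed to hold a valid rank of the global communicator the object was built on.
.vb
  PetscMPIInt lrank;
  PetscCall(PetscShmCommGlobalToLocal(pshmcomm, grank, &lrank));
  if (lrank == MPI_PROC_NULL) { /* grank does not share memory with this process */ }
.ve
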
147811af0c4SBarry Smith .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
1485f7487a0SJunchao Zhang @*/
149d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
150d71ae5a4SJacob Faibussowitsch {
1515f7487a0SJunchao Zhang   PetscMPIInt low, high, t, i;
1525f7487a0SJunchao Zhang   PetscBool   flg = PETSC_FALSE;
1535f7487a0SJunchao Zhang 
1545f7487a0SJunchao Zhang   PetscFunctionBegin;
1554f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
1564f572ea9SToby Isaac   PetscAssertPointer(lrank, 3);
1575f7487a0SJunchao Zhang   *lrank = MPI_PROC_NULL;
1583ba16761SJacob Faibussowitsch   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
1593ba16761SJacob Faibussowitsch   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
1609566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
1613ba16761SJacob Faibussowitsch   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
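  /* binary search in the sorted globranks[]: bisect until the window [low, high) has at most 5 entries, then scan it linearly */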
1625f7487a0SJunchao Zhang   low  = 0;
1635f7487a0SJunchao Zhang   high = pshmcomm->shmsize;
1645f7487a0SJunchao Zhang   while (high - low > 5) {
1655f7487a0SJunchao Zhang     t = (low + high) / 2;
1665f7487a0SJunchao Zhang     if (pshmcomm->globranks[t] > grank) high = t;
1675f7487a0SJunchao Zhang     else low = t;
1685f7487a0SJunchao Zhang   }
1695f7487a0SJunchao Zhang   for (i = low; i < high; i++) {
1703ba16761SJacob Faibussowitsch     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
1715f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] == grank) {
1725f7487a0SJunchao Zhang       *lrank = i;
1733ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
1745f7487a0SJunchao Zhang     }
1755f7487a0SJunchao Zhang   }
1763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1775f7487a0SJunchao Zhang }
1785f7487a0SJunchao Zhang 
1795f7487a0SJunchao Zhang /*@C
1805f7487a0SJunchao Zhang   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
1815f7487a0SJunchao Zhang 
1825f7487a0SJunchao Zhang   Input Parameters:
1835f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object
1845f7487a0SJunchao Zhang - lrank    - the local rank in the shared memory communicator
1855f7487a0SJunchao Zhang 
1865f7487a0SJunchao Zhang   Output Parameter:
1875f7487a0SJunchao Zhang . grank - the global rank in the global communicator where the shared memory communicator is built
1885f7487a0SJunchao Zhang 
1895f7487a0SJunchao Zhang   Level: developer
1905f7487a0SJunchao Zhang 
191811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
1925f7487a0SJunchao Zhang @*/
193d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
194d71ae5a4SJacob Faibussowitsch {
1955f7487a0SJunchao Zhang   PetscFunctionBegin;
1964f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
1974f572ea9SToby Isaac   PetscAssertPointer(grank, 3);
1982c71b3e2SJacob Faibussowitsch   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
1995f7487a0SJunchao Zhang   *grank = pshmcomm->globranks[lrank];
2003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2015f7487a0SJunchao Zhang }
2025f7487a0SJunchao Zhang 
2035f7487a0SJunchao Zhang /*@C
2045f7487a0SJunchao Zhang   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
2055f7487a0SJunchao Zhang 
2065f7487a0SJunchao Zhang   Input Parameter:
2075f7487a0SJunchao Zhang . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
2085f7487a0SJunchao Zhang 
2095f7487a0SJunchao Zhang   Output Parameter:
2105f7487a0SJunchao Zhang . comm - the MPI communicator
2115f7487a0SJunchao Zhang 
2125f7487a0SJunchao Zhang   Level: developer
2135f7487a0SJunchao Zhang 
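  Example Usage:
  A minimal sketch; the communicator is owned by the PetscShmComm object and should not be freed by the caller.
.vb
  MPI_Comm    mpishm;
  PetscMPIInt shmsize;
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &mpishm));
  PetscCallMPI(MPI_Comm_size(mpishm, &shmsize)); /* number of ranks sharing memory with this process */
.ve
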
214811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
2155f7487a0SJunchao Zhang @*/
216d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
217d71ae5a4SJacob Faibussowitsch {
2185f7487a0SJunchao Zhang   PetscFunctionBegin;
2194f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
2204f572ea9SToby Isaac   PetscAssertPointer(comm, 2);
2215f7487a0SJunchao Zhang   *comm = pshmcomm->shmcomm;
2223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2235f7487a0SJunchao Zhang }
2245f7487a0SJunchao Zhang 
22520b3346cSJunchao Zhang #if defined(PETSC_HAVE_OPENMP_SUPPORT)
226a32e93adSJunchao Zhang   #include <pthread.h>
227a32e93adSJunchao Zhang   #include <hwloc.h>
228a32e93adSJunchao Zhang   #include <omp.h>
229a32e93adSJunchao Zhang 
230eff715bbSJunchao Zhang   /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
231eff715bbSJunchao Zhang    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
2324df5c2c7SJunchao Zhang    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
2334df5c2c7SJunchao Zhang    by 50%. Until the reason is found out, we use mmap() instead.
2344df5c2c7SJunchao Zhang */
2354df5c2c7SJunchao Zhang   #define USE_MMAP_ALLOCATE_SHARED_MEMORY
2364df5c2c7SJunchao Zhang 
2374df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2384df5c2c7SJunchao Zhang     #include <sys/mman.h>
2394df5c2c7SJunchao Zhang     #include <sys/types.h>
2404df5c2c7SJunchao Zhang     #include <sys/stat.h>
2414df5c2c7SJunchao Zhang     #include <fcntl.h>
2424df5c2c7SJunchao Zhang   #endif
2434df5c2c7SJunchao Zhang 
244a32e93adSJunchao Zhang struct _n_PetscOmpCtrl {
245a32e93adSJunchao Zhang   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
246a32e93adSJunchao Zhang   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
247a32e93adSJunchao Zhang   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
248a32e93adSJunchao Zhang   PetscBool          is_omp_master;   /* true on rank 0 of omp_comm (the OMP master) */
249a32e93adSJunchao Zhang   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
250a32e93adSJunchao Zhang   pthread_barrier_t *barrier;         /* pointer to the barrier */
251a32e93adSJunchao Zhang   hwloc_topology_t   topology;
252a32e93adSJunchao Zhang   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
253a32e93adSJunchao Zhang   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
254a32e93adSJunchao Zhang };
255a32e93adSJunchao Zhang 
256eff715bbSJunchao Zhang /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
2578fcaa860SBarry Smith    contained by the controller.
258eff715bbSJunchao Zhang 
2598fcaa860SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
260eff715bbSJunchao Zhang    only because we want to separate shared memory allocation methods from other code.
261eff715bbSJunchao Zhang  */
262d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
263d71ae5a4SJacob Faibussowitsch {
264a32e93adSJunchao Zhang   MPI_Aint              size;
265a32e93adSJunchao Zhang   void                 *baseptr;
266a32e93adSJunchao Zhang   pthread_barrierattr_t attr;
267a32e93adSJunchao Zhang 
2684df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
26961ef3065SPierre Jolivet   int       fd;
2704df5c2c7SJunchao Zhang   PetscChar pathname[PETSC_MAX_PATH_LEN];
2714df5c2c7SJunchao Zhang   #else
2724df5c2c7SJunchao Zhang   PetscMPIInt disp_unit;
2734df5c2c7SJunchao Zhang   #endif
2744df5c2c7SJunchao Zhang 
2754df5c2c7SJunchao Zhang   PetscFunctionBegin;
2764df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2774df5c2c7SJunchao Zhang   size = sizeof(pthread_barrier_t);
2784df5c2c7SJunchao Zhang   if (ctrl->is_omp_master) {
279eff715bbSJunchao Zhang     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
2809566063dSJacob Faibussowitsch     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
2819566063dSJacob Faibussowitsch     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
2824df5c2c7SJunchao Zhang     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
2839371c9d4SSatish Balay     fd = mkstemp(pathname);
2849371c9d4SSatish Balay     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
28561ef3065SPierre Jolivet     PetscCallExternal(ftruncate, fd, size);
2869371c9d4SSatish Balay     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2879371c9d4SSatish Balay     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
28861ef3065SPierre Jolivet     PetscCallExternal(close, fd);
2899566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
290eff715bbSJunchao Zhang     /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
2919566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
29261ef3065SPierre Jolivet     PetscCallExternal(unlink, pathname);
2934df5c2c7SJunchao Zhang   } else {
2949566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
2959371c9d4SSatish Balay     fd = open(pathname, O_RDWR);
2969371c9d4SSatish Balay     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
2979371c9d4SSatish Balay     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2989371c9d4SSatish Balay     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
29961ef3065SPierre Jolivet     PetscCallExternal(close, fd);
3009566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
3014df5c2c7SJunchao Zhang   }
3024df5c2c7SJunchao Zhang   #else
303a32e93adSJunchao Zhang   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
3049566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
3059566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
3064df5c2c7SJunchao Zhang   #endif
307a32e93adSJunchao Zhang   ctrl->barrier = (pthread_barrier_t *)baseptr;
308a32e93adSJunchao Zhang 
309a32e93adSJunchao Zhang   /* omp master initializes the barrier */
310a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
3119566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
31261ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_init, &attr);
31361ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
31461ef3065SPierre Jolivet     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
31561ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_destroy, &attr);
316a32e93adSJunchao Zhang   }
317a32e93adSJunchao Zhang 
3184df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
3199566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
3203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
321a32e93adSJunchao Zhang }
322a32e93adSJunchao Zhang 
3238fcaa860SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
324d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
325d71ae5a4SJacob Faibussowitsch {
3264df5c2c7SJunchao Zhang   PetscFunctionBegin;
3274df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
3289566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
32961ef3065SPierre Jolivet   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
3304df5c2c7SJunchao Zhang 
3314df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
33261ef3065SPierre Jolivet   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
3334df5c2c7SJunchao Zhang   #else
3349566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
3354df5c2c7SJunchao Zhang   #endif
3363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
337a32e93adSJunchao Zhang }
338a32e93adSJunchao Zhang 
339eff715bbSJunchao Zhang /*@C
340811af0c4SBarry Smith     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
341eff715bbSJunchao Zhang 
342d8d19677SJose E. Roman     Input Parameters:
343eff715bbSJunchao Zhang +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
344a2b725a8SWilliam Gropp -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
345eff715bbSJunchao Zhang 
346eff715bbSJunchao Zhang     Output Parameter:
3478fcaa860SBarry Smith .   pctrl      - a PETSc OpenMP controller
348eff715bbSJunchao Zhang 
349eff715bbSJunchao Zhang     Level: developer
350eff715bbSJunchao Zhang 
351811af0c4SBarry Smith     Developer Note:
352811af0c4SBarry Smith     Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
3538fcaa860SBarry Smith 
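    Example Usage:
    A minimal sketch of the intended life cycle (the third party call is a placeholder):
.vb
    PetscOmpCtrl ctrl;
    PetscBool    is_omp_master;
    PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* -1: let PETSc pick nthreads */
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_omp_master));
    if (is_omp_master) {
      PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
      /* ... call a third party library that uses OpenMP ... */
      PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
    }
    PetscCall(PetscOmpCtrlBarrier(ctrl));
    PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve
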
354811af0c4SBarry Smith .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
355eff715bbSJunchao Zhang @*/
356d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
357d71ae5a4SJacob Faibussowitsch {
358a32e93adSJunchao Zhang   PetscOmpCtrl   ctrl;
359a32e93adSJunchao Zhang   unsigned long *cpu_ulongs = NULL;
360a32e93adSJunchao Zhang   PetscInt       i, nr_cpu_ulongs;
361a32e93adSJunchao Zhang   PetscShmComm   pshmcomm;
362a32e93adSJunchao Zhang   MPI_Comm       shm_comm;
363a32e93adSJunchao Zhang   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
3647c405c4aSJunchao Zhang   PetscInt       num_packages, num_cores;
365a32e93adSJunchao Zhang 
366a32e93adSJunchao Zhang   PetscFunctionBegin;
3679566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ctrl));
368a32e93adSJunchao Zhang 
369a32e93adSJunchao Zhang   /*=================================================================================
3707c405c4aSJunchao Zhang     Init hwloc
3717c405c4aSJunchao Zhang    ==================================================================================*/
37261ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
3737c405c4aSJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
3747c405c4aSJunchao Zhang   /* to filter out unneeded info and have faster hwloc_topology_load */
37561ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
37661ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
3777c405c4aSJunchao Zhang   #endif
37861ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_load, ctrl->topology);
3797c405c4aSJunchao Zhang 
3807c405c4aSJunchao Zhang   /*=================================================================================
381a32e93adSJunchao Zhang     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
382a32e93adSJunchao Zhang     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
383a32e93adSJunchao Zhang     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
384a32e93adSJunchao Zhang     which is usually passed to third party libraries.
385a32e93adSJunchao Zhang    ==================================================================================*/
386a32e93adSJunchao Zhang 
387a32e93adSJunchao Zhang   /* fetch the stored shared memory communicator */
3889566063dSJacob Faibussowitsch   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
3899566063dSJacob Faibussowitsch   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
390a32e93adSJunchao Zhang 
3919566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
3929566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
393a32e93adSJunchao Zhang 
3947c405c4aSJunchao Zhang   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
3957c405c4aSJunchao Zhang   if (nthreads == -1) {
396a312e481SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
397a312e481SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
3987c405c4aSJunchao Zhang     nthreads     = num_cores / num_packages;
3997c405c4aSJunchao Zhang     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
4007c405c4aSJunchao Zhang   }
4017c405c4aSJunchao Zhang 
4025f80ce2aSJacob Faibussowitsch   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
4039566063dSJacob Faibussowitsch   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
404a32e93adSJunchao Zhang 
405a32e93adSJunchao Zhang   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
406a32e93adSJunchao Zhang      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
407a32e93adSJunchao Zhang      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
408a32e93adSJunchao Zhang      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
409a32e93adSJunchao Zhang      Use 0 as key so that rank ordering won't change in the new comm.
410a32e93adSJunchao Zhang    */
411a32e93adSJunchao Zhang   color = shm_rank / nthreads;
4129566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
413a32e93adSJunchao Zhang 
414a32e93adSJunchao Zhang   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
4159566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
416a32e93adSJunchao Zhang   if (!omp_rank) {
417a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_TRUE; /* master */
418a32e93adSJunchao Zhang     color               = 0;
419a32e93adSJunchao Zhang   } else {
420a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
421a32e93adSJunchao Zhang     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
422a32e93adSJunchao Zhang   }
4239566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
424a32e93adSJunchao Zhang 
425a32e93adSJunchao Zhang   /*=================================================================================
426a32e93adSJunchao Zhang     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
427a32e93adSJunchao Zhang     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
428a32e93adSJunchao Zhang     and run them on the idle CPUs.
429a32e93adSJunchao Zhang    ==================================================================================*/
4309566063dSJacob Faibussowitsch   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
431a32e93adSJunchao Zhang 
432a32e93adSJunchao Zhang   /*=================================================================================
433a32e93adSJunchao Zhang     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
434a32e93adSJunchao Zhang     is the union of the bindings of all ranks in the omp_comm
435a32e93adSJunchao Zhang     =================================================================================*/
436a32e93adSJunchao Zhang 
4379371c9d4SSatish Balay   ctrl->cpuset = hwloc_bitmap_alloc();
4389371c9d4SSatish Balay   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
43961ef3065SPierre Jolivet   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
440a32e93adSJunchao Zhang 
4413ab56b82SJunchao Zhang   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
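  /* number of unsigned longs needed to hold the topology-wide cpuset, so it can be packed into an array for the MPI_Reduce below */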
442a32e93adSJunchao Zhang   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
4439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
444a32e93adSJunchao Zhang   if (nr_cpu_ulongs == 1) {
445a32e93adSJunchao Zhang     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
446a32e93adSJunchao Zhang   } else {
447a32e93adSJunchao Zhang     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
448a32e93adSJunchao Zhang   }
449a32e93adSJunchao Zhang 
4509566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
451a32e93adSJunchao Zhang 
452a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
4539371c9d4SSatish Balay     ctrl->omp_cpuset = hwloc_bitmap_alloc();
4549371c9d4SSatish Balay     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
455a32e93adSJunchao Zhang     if (nr_cpu_ulongs == 1) {
4563ab56b82SJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
45761ef3065SPierre Jolivet       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
4583ab56b82SJunchao Zhang   #else
4593ab56b82SJunchao Zhang       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
4603ab56b82SJunchao Zhang   #endif
461a32e93adSJunchao Zhang     } else {
4623ab56b82SJunchao Zhang       for (i = 0; i < nr_cpu_ulongs; i++) {
4633ab56b82SJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
46461ef3065SPierre Jolivet         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
4653ab56b82SJunchao Zhang   #else
4663ab56b82SJunchao Zhang         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
4673ab56b82SJunchao Zhang   #endif
4683ab56b82SJunchao Zhang       }
469a32e93adSJunchao Zhang     }
470a32e93adSJunchao Zhang   }
4719566063dSJacob Faibussowitsch   PetscCall(PetscFree(cpu_ulongs));
472a32e93adSJunchao Zhang   *pctrl = ctrl;
4733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
474a32e93adSJunchao Zhang }
475a32e93adSJunchao Zhang 
476eff715bbSJunchao Zhang /*@C
47764f49babSJed Brown     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
478eff715bbSJunchao Zhang 
479eff715bbSJunchao Zhang     Input Parameter:
4808fcaa860SBarry Smith .   pctrl  - a PETSc OpenMP controller
481eff715bbSJunchao Zhang 
482eff715bbSJunchao Zhang     Level: developer
483eff715bbSJunchao Zhang 
484811af0c4SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
485eff715bbSJunchao Zhang @*/
486d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
487d71ae5a4SJacob Faibussowitsch {
488a32e93adSJunchao Zhang   PetscOmpCtrl ctrl = *pctrl;
489a32e93adSJunchao Zhang 
490a32e93adSJunchao Zhang   PetscFunctionBegin;
491a32e93adSJunchao Zhang   hwloc_bitmap_free(ctrl->cpuset);
492a32e93adSJunchao Zhang   hwloc_topology_destroy(ctrl->topology);
4933ba16761SJacob Faibussowitsch   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
4949566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
495a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
496a32e93adSJunchao Zhang     hwloc_bitmap_free(ctrl->omp_cpuset);
4979566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
498a32e93adSJunchao Zhang   }
4999566063dSJacob Faibussowitsch   PetscCall(PetscFree(ctrl));
5003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
501a32e93adSJunchao Zhang }
502a32e93adSJunchao Zhang 
503a32e93adSJunchao Zhang /*@C
5048fcaa860SBarry Smith     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
505a32e93adSJunchao Zhang 
506a32e93adSJunchao Zhang     Input Parameter:
5078fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
508a32e93adSJunchao Zhang 
509d8d19677SJose E. Roman     Output Parameters:
510eff715bbSJunchao Zhang +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
511a32e93adSJunchao Zhang .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
512811af0c4SBarry Smith                        on slave ranks, `MPI_COMM_NULL` is returned.
513a32e93adSJunchao Zhang -   is_omp_master    - true if the calling process is an OMP master rank.
514a32e93adSJunchao Zhang 
515811af0c4SBarry Smith     Note:
516*dc9a610eSPierre Jolivet     Any output parameter can be `NULL`. The parameter is just ignored.
517eff715bbSJunchao Zhang 
518a32e93adSJunchao Zhang     Level: developer
519811af0c4SBarry Smith 
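    Example Usage:
    A minimal sketch; any output that is not needed may be passed as NULL:
.vb
    MPI_Comm  omp_comm, omp_master_comm;
    PetscBool is_omp_master;
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_omp_master));
    if (is_omp_master) { /* pass omp_master_comm to a third party library that spawns OpenMP threads */ }
.ve
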
520811af0c4SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
521a32e93adSJunchao Zhang @*/
522d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
523d71ae5a4SJacob Faibussowitsch {
524a32e93adSJunchao Zhang   PetscFunctionBegin;
525a32e93adSJunchao Zhang   if (omp_comm) *omp_comm = ctrl->omp_comm;
526a32e93adSJunchao Zhang   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
527a32e93adSJunchao Zhang   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
5283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
529a32e93adSJunchao Zhang }
530a32e93adSJunchao Zhang 
531eff715bbSJunchao Zhang /*@C
5328fcaa860SBarry Smith     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
533eff715bbSJunchao Zhang 
534eff715bbSJunchao Zhang     Input Parameter:
5358fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
536eff715bbSJunchao Zhang 
537eff715bbSJunchao Zhang     Notes:
538811af0c4SBarry Smith     this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
539811af0c4SBarry Smith     require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
540811af0c4SBarry Smith     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, a pthread barrier does have this requirement.
541eff715bbSJunchao Zhang 
542811af0c4SBarry Smith     A code using `PetscOmpCtrlBarrier()` would be like this,
543811af0c4SBarry Smith .vb
544eff715bbSJunchao Zhang     if (is_omp_master) {
545eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
546eff715bbSJunchao Zhang       Call the library using OpenMP
547eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
548eff715bbSJunchao Zhang     }
549eff715bbSJunchao Zhang     PetscOmpCtrlBarrier(ctrl);
550811af0c4SBarry Smith .ve
551eff715bbSJunchao Zhang 
552eff715bbSJunchao Zhang     Level: developer
553eff715bbSJunchao Zhang 
554811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
555eff715bbSJunchao Zhang @*/
556d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
557d71ae5a4SJacob Faibussowitsch {
5582da392ccSBarry Smith   int err;
559a32e93adSJunchao Zhang 
560a32e93adSJunchao Zhang   PetscFunctionBegin;
5612da392ccSBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
56239619372SPierre Jolivet   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
5633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
564a32e93adSJunchao Zhang }
565a32e93adSJunchao Zhang 
566eff715bbSJunchao Zhang /*@C
567eff715bbSJunchao Zhang     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
568eff715bbSJunchao Zhang 
569eff715bbSJunchao Zhang     Input Parameter:
5708fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
571eff715bbSJunchao Zhang 
572811af0c4SBarry Smith     Note:
573811af0c4SBarry Smith     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
574eff715bbSJunchao Zhang     This function changes the CPU binding of the master rank and the nthreads-var of the OpenMP runtime
575eff715bbSJunchao Zhang 
576eff715bbSJunchao Zhang     Level: developer
577eff715bbSJunchao Zhang 
578811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
579eff715bbSJunchao Zhang @*/
580d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
581d71ae5a4SJacob Faibussowitsch {
582a32e93adSJunchao Zhang   PetscFunctionBegin;
58361ef3065SPierre Jolivet   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
584eff715bbSJunchao Zhang   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
5853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
586a32e93adSJunchao Zhang }
587a32e93adSJunchao Zhang 
588eff715bbSJunchao Zhang /*@C
589eff715bbSJunchao Zhang    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
590eff715bbSJunchao Zhang 
591eff715bbSJunchao Zhang    Input Parameter:
5928fcaa860SBarry Smith .  ctrl - a PETSc OMP controller
593eff715bbSJunchao Zhang 
594811af0c4SBarry Smith    Note:
595811af0c4SBarry Smith    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
596eff715bbSJunchao Zhang    This function restores the CPU binding of the master rank and sets the nthreads-var of the OpenMP runtime to 1.
597eff715bbSJunchao Zhang 
598eff715bbSJunchao Zhang    Level: developer
599eff715bbSJunchao Zhang 
600811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
601eff715bbSJunchao Zhang @*/
602d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
603d71ae5a4SJacob Faibussowitsch {
604a32e93adSJunchao Zhang   PetscFunctionBegin;
60561ef3065SPierre Jolivet   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
606eff715bbSJunchao Zhang   omp_set_num_threads(1);
6073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
608a32e93adSJunchao Zhang }
609a32e93adSJunchao Zhang 
6104df5c2c7SJunchao Zhang   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
61120b3346cSJunchao Zhang #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
612