xref: /petsc/src/sys/utils/mpishm.c (revision cf27e480737f7f40fb2f5c38a85b346ae95c4aac)
15f7487a0SJunchao Zhang #include <petscsys.h> /*I  "petscsys.h"  I*/
25f7487a0SJunchao Zhang #include <petsc/private/petscimpl.h>
35f7487a0SJunchao Zhang 
45f7487a0SJunchao Zhang struct _n_PetscShmComm {
55f7487a0SJunchao Zhang   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
65f7487a0SJunchao Zhang   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
75f7487a0SJunchao Zhang   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
85f7487a0SJunchao Zhang };
95f7487a0SJunchao Zhang 
105f7487a0SJunchao Zhang /*
1133779a13SJunchao Zhang    Private routine to delete internal shared memory communicator when a communicator is freed.
125f7487a0SJunchao Zhang 
135f7487a0SJunchao Zhang    This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.
145f7487a0SJunchao Zhang 
155f7487a0SJunchao Zhang    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
165f7487a0SJunchao Zhang 
175f7487a0SJunchao Zhang */
188434afd1SBarry Smith PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19d71ae5a4SJacob Faibussowitsch {
205f7487a0SJunchao Zhang   PetscShmComm p = (PetscShmComm)val;
215f7487a0SJunchao Zhang 
225f7487a0SJunchao Zhang   PetscFunctionBegin;
237c5b2466SBarry Smith   PetscCallReturnMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
247c5b2466SBarry Smith   PetscCallMPIReturnMPI(MPI_Comm_free(&p->shmcomm));
257c5b2466SBarry Smith   PetscCallReturnMPI(PetscFree(p->globranks));
267c5b2466SBarry Smith   PetscCallReturnMPI(PetscFree(val));
275f7487a0SJunchao Zhang   PetscFunctionReturn(MPI_SUCCESS);
285f7487a0SJunchao Zhang }
295f7487a0SJunchao Zhang 
30b48189acSJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31b48189acSJunchao Zhang   /* Data structures to support freeing comms created in PetscShmCommGet().
32b48189acSJunchao Zhang   Since communicators passed to PetscShmCommGet() are very likely either a
33b48189acSJunchao Zhang   petsc inner communicator or an MPI communicator with a linked petsc inner
34b48189acSJunchao Zhang   communicator, we use a simple static array to store the dupped communicators
35b48189acSJunchao Zhang   in the rare remaining cases.
36b48189acSJunchao Zhang  */
37b48189acSJunchao Zhang   #define MAX_SHMCOMM_DUPPED_COMMS 16
38b48189acSJunchao Zhang static PetscInt       num_dupped_comms = 0;
39b48189acSJunchao Zhang static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41d71ae5a4SJacob Faibussowitsch {
42b48189acSJunchao Zhang   PetscFunctionBegin;
43*cf27e480SPierre Jolivet   for (PetscInt i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
44b48189acSJunchao Zhang   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46b48189acSJunchao Zhang }
47b48189acSJunchao Zhang #endif
48b48189acSJunchao Zhang 
495f7487a0SJunchao Zhang /*@C
50811af0c4SBarry Smith   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory
515f7487a0SJunchao Zhang 
52d083f849SBarry Smith   Collective.
535f7487a0SJunchao Zhang 
545f7487a0SJunchao Zhang   Input Parameter:
55a3b724e8SBarry Smith . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`
565f7487a0SJunchao Zhang 
575f7487a0SJunchao Zhang   Output Parameter:
585f7487a0SJunchao Zhang . pshmcomm - the PETSc shared memory communicator object
595f7487a0SJunchao Zhang 
605f7487a0SJunchao Zhang   Level: developer
615f7487a0SJunchao Zhang 
62811af0c4SBarry Smith   Note:
63a3b724e8SBarry Smith   When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`
645f7487a0SJunchao Zhang 
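  Example Usage:
  A minimal sketch of typical use (the variable names are illustrative, not from an actual PETSc example):
.vb
  PetscShmComm shm;
  MPI_Comm     shmcomm;
  PetscMPIInt  shmrank;

  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shm));
  PetscCall(PetscShmCommGetMpiShmComm(shm, &shmcomm)); /* MPI communicator of the ranks that share memory with this one */
  PetscCallMPI(MPI_Comm_rank(shmcomm, &shmrank));      /* this process's rank within its shared memory node */
.ve
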
65811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
665f7487a0SJunchao Zhang @*/
67d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
68d71ae5a4SJacob Faibussowitsch {
695f7487a0SJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
705f7487a0SJunchao Zhang   MPI_Group         globgroup, shmgroup;
715f7487a0SJunchao Zhang   PetscMPIInt      *shmranks, i, flg;
725f7487a0SJunchao Zhang   PetscCommCounter *counter;
735f7487a0SJunchao Zhang 
745f7487a0SJunchao Zhang   PetscFunctionBegin;
754f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 2);
76b48189acSJunchao Zhang   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
779566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
78b48189acSJunchao Zhang   if (!flg) { /* globcomm is not a petsc comm */
799371c9d4SSatish Balay     union
809371c9d4SSatish Balay     {
819371c9d4SSatish Balay       MPI_Comm comm;
829371c9d4SSatish Balay       void    *ptr;
839371c9d4SSatish Balay     } ucomm;
84b48189acSJunchao Zhang     /* check if globcomm already has a linked petsc inner comm */
859566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
86b48189acSJunchao Zhang     if (!flg) {
87b48189acSJunchao Zhang       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
8808401ef6SPierre Jolivet       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
899566063dSJacob Faibussowitsch       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
90b48189acSJunchao Zhang       /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
919566063dSJacob Faibussowitsch       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
92b48189acSJunchao Zhang       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
93b48189acSJunchao Zhang       num_dupped_comms++;
94b48189acSJunchao Zhang     } else {
95b48189acSJunchao Zhang       /* otherwise, we pull out the inner comm and use it as globcomm */
96b48189acSJunchao Zhang       globcomm = ucomm.comm;
97b48189acSJunchao Zhang     }
98b48189acSJunchao Zhang   }
995f7487a0SJunchao Zhang 
100b48189acSJunchao Zhang   /* Check if globcomm already has an attached pshmcomm. If no, create one */
1019566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
1023ba16761SJacob Faibussowitsch   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
1035f7487a0SJunchao Zhang 
1049566063dSJacob Faibussowitsch   PetscCall(PetscNew(pshmcomm));
1055f7487a0SJunchao Zhang   (*pshmcomm)->globcomm = globcomm;
1065f7487a0SJunchao Zhang 
1079566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));
1085f7487a0SJunchao Zhang 
1099566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
1109566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
1119566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
1129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
1139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
1145f7487a0SJunchao Zhang   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
1159566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
1169566063dSJacob Faibussowitsch   PetscCall(PetscFree(shmranks));
1179566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_free(&globgroup));
1189566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Group_free(&shmgroup));
1195f7487a0SJunchao Zhang 
12048a46eb9SPierre Jolivet   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
1219566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
1223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1235f7487a0SJunchao Zhang #else
1245f7487a0SJunchao Zhang   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
1255f7487a0SJunchao Zhang #endif
1265f7487a0SJunchao Zhang }
1275f7487a0SJunchao Zhang 
1285f7487a0SJunchao Zhang /*@C
1295f7487a0SJunchao Zhang   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
1305f7487a0SJunchao Zhang 
1315f7487a0SJunchao Zhang   Input Parameters:
1325f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object
1335f7487a0SJunchao Zhang - grank    - the global rank
1345f7487a0SJunchao Zhang 
1355f7487a0SJunchao Zhang   Output Parameter:
136811af0c4SBarry Smith . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
1375f7487a0SJunchao Zhang 
1385f7487a0SJunchao Zhang   Level: developer
1395f7487a0SJunchao Zhang 
1405f7487a0SJunchao Zhang   Developer Notes:
1415f7487a0SJunchao Zhang   Assumes pshmcomm->globranks[] is sorted
1425f7487a0SJunchao Zhang 
1435f7487a0SJunchao Zhang   It may be better to rewrite this to map multiple global ranks to local in the same function call
1445f7487a0SJunchao Zhang 
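  Example Usage:
  A minimal sketch (illustrative only), assuming pshmcomm was obtained with `PetscShmCommGet()` on the same global communicator:
.vb
  PetscMPIInt lrank;

  PetscCall(PetscShmCommGlobalToLocal(pshmcomm, 0, &lrank)); /* local rank of global rank 0, or MPI_PROC_NULL if rank 0 is not on this node */
.ve
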
145811af0c4SBarry Smith .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
1465f7487a0SJunchao Zhang @*/
147d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
148d71ae5a4SJacob Faibussowitsch {
1495f7487a0SJunchao Zhang   PetscMPIInt low, high, t, i;
1505f7487a0SJunchao Zhang   PetscBool   flg = PETSC_FALSE;
1515f7487a0SJunchao Zhang 
1525f7487a0SJunchao Zhang   PetscFunctionBegin;
1534f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
1544f572ea9SToby Isaac   PetscAssertPointer(lrank, 3);
1555f7487a0SJunchao Zhang   *lrank = MPI_PROC_NULL;
1563ba16761SJacob Faibussowitsch   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
1573ba16761SJacob Faibussowitsch   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
1589566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
1593ba16761SJacob Faibussowitsch   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
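  /* coarse binary search: narrow [low, high) until at most 5 candidates remain, then finish with the linear scan below */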
1605f7487a0SJunchao Zhang   low  = 0;
1615f7487a0SJunchao Zhang   high = pshmcomm->shmsize;
1625f7487a0SJunchao Zhang   while (high - low > 5) {
1635f7487a0SJunchao Zhang     t = (low + high) / 2;
1645f7487a0SJunchao Zhang     if (pshmcomm->globranks[t] > grank) high = t;
1655f7487a0SJunchao Zhang     else low = t;
1665f7487a0SJunchao Zhang   }
1675f7487a0SJunchao Zhang   for (i = low; i < high; i++) {
1683ba16761SJacob Faibussowitsch     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
1695f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] == grank) {
1705f7487a0SJunchao Zhang       *lrank = i;
1713ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
1725f7487a0SJunchao Zhang     }
1735f7487a0SJunchao Zhang   }
1743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1755f7487a0SJunchao Zhang }
1765f7487a0SJunchao Zhang 
1775f7487a0SJunchao Zhang /*@C
1785f7487a0SJunchao Zhang   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
1795f7487a0SJunchao Zhang 
1805f7487a0SJunchao Zhang   Input Parameters:
1815f7487a0SJunchao Zhang + pshmcomm - the shared memory communicator object
1825f7487a0SJunchao Zhang - lrank    - the local rank in the shared memory communicator
1835f7487a0SJunchao Zhang 
1845f7487a0SJunchao Zhang   Output Parameter:
1855f7487a0SJunchao Zhang . grank - the global rank in the global communicator where the shared memory communicator is built
1865f7487a0SJunchao Zhang 
1875f7487a0SJunchao Zhang   Level: developer
1885f7487a0SJunchao Zhang 
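  Example Usage:
  A minimal sketch (illustrative only):
.vb
  PetscMPIInt grank;

  PetscCall(PetscShmCommLocalToGlobal(pshmcomm, 0, &grank)); /* global rank of local rank 0 on this shared memory node */
.ve
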
189811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
1905f7487a0SJunchao Zhang @*/
191d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
192d71ae5a4SJacob Faibussowitsch {
1935f7487a0SJunchao Zhang   PetscFunctionBegin;
1944f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
1954f572ea9SToby Isaac   PetscAssertPointer(grank, 3);
1962c71b3e2SJacob Faibussowitsch   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
1975f7487a0SJunchao Zhang   *grank = pshmcomm->globranks[lrank];
1983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1995f7487a0SJunchao Zhang }
2005f7487a0SJunchao Zhang 
2015f7487a0SJunchao Zhang /*@C
2025f7487a0SJunchao Zhang   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
2035f7487a0SJunchao Zhang 
2045f7487a0SJunchao Zhang   Input Parameter:
2055f7487a0SJunchao Zhang . pshmcomm - `PetscShmComm` object obtained with `PetscShmCommGet()`
2065f7487a0SJunchao Zhang 
2075f7487a0SJunchao Zhang   Output Parameter:
2085f7487a0SJunchao Zhang . comm - the MPI communicator
2095f7487a0SJunchao Zhang 
2105f7487a0SJunchao Zhang   Level: developer
2115f7487a0SJunchao Zhang 
212811af0c4SBarry Smith .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
2135f7487a0SJunchao Zhang @*/
214d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
215d71ae5a4SJacob Faibussowitsch {
2165f7487a0SJunchao Zhang   PetscFunctionBegin;
2174f572ea9SToby Isaac   PetscAssertPointer(pshmcomm, 1);
2184f572ea9SToby Isaac   PetscAssertPointer(comm, 2);
2195f7487a0SJunchao Zhang   *comm = pshmcomm->shmcomm;
2203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2215f7487a0SJunchao Zhang }
2225f7487a0SJunchao Zhang 
22320b3346cSJunchao Zhang #if defined(PETSC_HAVE_OPENMP_SUPPORT)
224a32e93adSJunchao Zhang   #include <pthread.h>
225a32e93adSJunchao Zhang   #include <hwloc.h>
226a32e93adSJunchao Zhang   #include <omp.h>
227a32e93adSJunchao Zhang 
228eff715bbSJunchao Zhang   /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
229eff715bbSJunchao Zhang    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
2304df5c2c7SJunchao Zhang    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
2314df5c2c7SJunchao Zhang    by 50%. Until the reason is understood, we use mmap() instead.
2324df5c2c7SJunchao Zhang */
2334df5c2c7SJunchao Zhang   #define USE_MMAP_ALLOCATE_SHARED_MEMORY
2344df5c2c7SJunchao Zhang 
2354df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2364df5c2c7SJunchao Zhang     #include <sys/mman.h>
2374df5c2c7SJunchao Zhang     #include <sys/types.h>
2384df5c2c7SJunchao Zhang     #include <sys/stat.h>
2394df5c2c7SJunchao Zhang     #include <fcntl.h>
2404df5c2c7SJunchao Zhang   #endif
2414df5c2c7SJunchao Zhang 
242a32e93adSJunchao Zhang struct _n_PetscOmpCtrl {
243a32e93adSJunchao Zhang   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
244a32e93adSJunchao Zhang   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
245a32e93adSJunchao Zhang   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
246a32e93adSJunchao Zhang   PetscBool          is_omp_master;   /* true on rank 0 of omp_comm */
247a32e93adSJunchao Zhang   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
248a32e93adSJunchao Zhang   pthread_barrier_t *barrier;         /* pointer to the barrier */
249a32e93adSJunchao Zhang   hwloc_topology_t   topology;
250a32e93adSJunchao Zhang   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
251a32e93adSJunchao Zhang   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
252a32e93adSJunchao Zhang };
253a32e93adSJunchao Zhang 
254eff715bbSJunchao Zhang /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
2558fcaa860SBarry Smith    contained by the controller.
256eff715bbSJunchao Zhang 
2578fcaa860SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
258eff715bbSJunchao Zhang    only because we want to separate shared memory allocation methods from other code.
259eff715bbSJunchao Zhang  */
260d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
261d71ae5a4SJacob Faibussowitsch {
262a32e93adSJunchao Zhang   MPI_Aint              size;
263a32e93adSJunchao Zhang   void                 *baseptr;
264a32e93adSJunchao Zhang   pthread_barrierattr_t attr;
265a32e93adSJunchao Zhang 
2664df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
26761ef3065SPierre Jolivet   int       fd;
2684df5c2c7SJunchao Zhang   PetscChar pathname[PETSC_MAX_PATH_LEN];
2694df5c2c7SJunchao Zhang   #else
2704df5c2c7SJunchao Zhang   PetscMPIInt disp_unit;
2714df5c2c7SJunchao Zhang   #endif
2724df5c2c7SJunchao Zhang 
2734df5c2c7SJunchao Zhang   PetscFunctionBegin;
2744df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2754df5c2c7SJunchao Zhang   size = sizeof(pthread_barrier_t);
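  /* The master creates a temporary file sized to hold the barrier, mmap()s it, broadcasts its path, and unlinks it
     once all slaves have opened it; the slaves open and mmap() the same file, so all ranks share the barrier's memory */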
2764df5c2c7SJunchao Zhang   if (ctrl->is_omp_master) {
277eff715bbSJunchao Zhang     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would bcast the partially populated pathname to the slaves */
2789566063dSJacob Faibussowitsch     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
2799566063dSJacob Faibussowitsch     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
2804df5c2c7SJunchao Zhang     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
2819371c9d4SSatish Balay     fd = mkstemp(pathname);
2829371c9d4SSatish Balay     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
28361ef3065SPierre Jolivet     PetscCallExternal(ftruncate, fd, size);
2849371c9d4SSatish Balay     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2859371c9d4SSatish Balay     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
28661ef3065SPierre Jolivet     PetscCallExternal(close, fd);
2879566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
288eff715bbSJunchao Zhang     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
2899566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
29061ef3065SPierre Jolivet     PetscCallExternal(unlink, pathname);
2914df5c2c7SJunchao Zhang   } else {
2929566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
2939371c9d4SSatish Balay     fd = open(pathname, O_RDWR);
2949371c9d4SSatish Balay     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
2959371c9d4SSatish Balay     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2969371c9d4SSatish Balay     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
29761ef3065SPierre Jolivet     PetscCallExternal(close, fd);
2989566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
2994df5c2c7SJunchao Zhang   }
3004df5c2c7SJunchao Zhang   #else
301a32e93adSJunchao Zhang   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
3029566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
3039566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
3044df5c2c7SJunchao Zhang   #endif
305a32e93adSJunchao Zhang   ctrl->barrier = (pthread_barrier_t *)baseptr;
306a32e93adSJunchao Zhang 
307a32e93adSJunchao Zhang   /* omp master initializes the barrier */
308a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
3099566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
31061ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_init, &attr);
31161ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
31261ef3065SPierre Jolivet     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
31361ef3065SPierre Jolivet     PetscCallExternal(pthread_barrierattr_destroy, &attr);
314a32e93adSJunchao Zhang   }
315a32e93adSJunchao Zhang 
3164df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
3179566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
3183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
319a32e93adSJunchao Zhang }
320a32e93adSJunchao Zhang 
3218fcaa860SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
322d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
323d71ae5a4SJacob Faibussowitsch {
3244df5c2c7SJunchao Zhang   PetscFunctionBegin;
3254df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
3269566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
32761ef3065SPierre Jolivet   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
3284df5c2c7SJunchao Zhang 
3294df5c2c7SJunchao Zhang   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
33061ef3065SPierre Jolivet   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
3314df5c2c7SJunchao Zhang   #else
3329566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
3334df5c2c7SJunchao Zhang   #endif
3343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
335a32e93adSJunchao Zhang }
336a32e93adSJunchao Zhang 
337eff715bbSJunchao Zhang /*@C
338811af0c4SBarry Smith     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
339eff715bbSJunchao Zhang 
340d8d19677SJose E. Roman     Input Parameters:
341eff715bbSJunchao Zhang +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
342a2b725a8SWilliam Gropp -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
343eff715bbSJunchao Zhang 
344eff715bbSJunchao Zhang     Output Parameter:
3458fcaa860SBarry Smith .   pctrl      - a PETSc OpenMP controller
346eff715bbSJunchao Zhang 
347eff715bbSJunchao Zhang     Level: developer
348eff715bbSJunchao Zhang 
349811af0c4SBarry Smith     Developer Note:
350811af0c4SBarry Smith     Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use
3518fcaa860SBarry Smith 
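    Example Usage:
    A minimal sketch of how a controller might be created and used (illustrative only; see `PetscOmpCtrlBarrier()` for the full calling pattern):
.vb
    PetscOmpCtrl ctrl;
    MPI_Comm     omp_master_comm;
    PetscBool    is_master;

    PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* -1: let PETSc pick the number of threads */
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, &omp_master_comm, &is_master));
    /* on master ranks, hand omp_master_comm to the third party library */
    PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve
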
352811af0c4SBarry Smith .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
353eff715bbSJunchao Zhang @*/
354d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
355d71ae5a4SJacob Faibussowitsch {
356a32e93adSJunchao Zhang   PetscOmpCtrl   ctrl;
357a32e93adSJunchao Zhang   unsigned long *cpu_ulongs = NULL;
358a32e93adSJunchao Zhang   PetscShmComm   pshmcomm;
359a32e93adSJunchao Zhang   MPI_Comm       shm_comm;
360*cf27e480SPierre Jolivet   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
3617c405c4aSJunchao Zhang   PetscInt       num_packages, num_cores;
362a32e93adSJunchao Zhang 
363a32e93adSJunchao Zhang   PetscFunctionBegin;
3649566063dSJacob Faibussowitsch   PetscCall(PetscNew(&ctrl));
365a32e93adSJunchao Zhang 
366a32e93adSJunchao Zhang   /*=================================================================================
3677c405c4aSJunchao Zhang     Init hwloc
3687c405c4aSJunchao Zhang    ==================================================================================*/
36961ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
3707c405c4aSJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
3717c405c4aSJunchao Zhang   /* to filter out unneeded info and have faster hwloc_topology_load */
37261ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
37361ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
3747c405c4aSJunchao Zhang   #endif
37561ef3065SPierre Jolivet   PetscCallExternal(hwloc_topology_load, ctrl->topology);
3767c405c4aSJunchao Zhang 
3777c405c4aSJunchao Zhang   /*=================================================================================
378a32e93adSJunchao Zhang     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
379a32e93adSJunchao Zhang     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
380a32e93adSJunchao Zhang     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
381a32e93adSJunchao Zhang     which is usually passed to third party libraries.
382a32e93adSJunchao Zhang    ==================================================================================*/
383a32e93adSJunchao Zhang 
384a32e93adSJunchao Zhang   /* fetch the stored shared memory communicator */
3859566063dSJacob Faibussowitsch   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
3869566063dSJacob Faibussowitsch   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
387a32e93adSJunchao Zhang 
3889566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
3899566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
390a32e93adSJunchao Zhang 
3917c405c4aSJunchao Zhang   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
3927c405c4aSJunchao Zhang   if (nthreads == -1) {
393a312e481SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
394a312e481SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
3957c405c4aSJunchao Zhang     nthreads     = num_cores / num_packages;
3967c405c4aSJunchao Zhang     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
3977c405c4aSJunchao Zhang   }
3987c405c4aSJunchao Zhang 
3995f80ce2aSJacob Faibussowitsch   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " cannot be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
4009566063dSJacob Faibussowitsch   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n", nthreads, shm_comm_size));
401a32e93adSJunchao Zhang 
402a32e93adSJunchao Zhang   /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
403a32e93adSJunchao Zhang      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
404a32e93adSJunchao Zhang      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
405a32e93adSJunchao Zhang      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
406a32e93adSJunchao Zhang      Use 0 as key so that rank ordering won't change in the new comm.
407a32e93adSJunchao Zhang    */
408a32e93adSJunchao Zhang   color = shm_rank / nthreads;
4099566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
410a32e93adSJunchao Zhang 
411a32e93adSJunchao Zhang   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
4129566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
413a32e93adSJunchao Zhang   if (!omp_rank) {
414a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_TRUE; /* master */
415a32e93adSJunchao Zhang     color               = 0;
416a32e93adSJunchao Zhang   } else {
417a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
418a32e93adSJunchao Zhang     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
419a32e93adSJunchao Zhang   }
4209566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
421a32e93adSJunchao Zhang 
422a32e93adSJunchao Zhang   /*=================================================================================
423a32e93adSJunchao Zhang     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
424a32e93adSJunchao Zhang     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
425a32e93adSJunchao Zhang     and run them on the idle CPUs.
426a32e93adSJunchao Zhang    ==================================================================================*/
4279566063dSJacob Faibussowitsch   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
428a32e93adSJunchao Zhang 
429a32e93adSJunchao Zhang   /*=================================================================================
430a32e93adSJunchao Zhang     The omp master records its cpu binding (i.e., cpu set) and computes a new binding that
431a32e93adSJunchao Zhang     is the union of the bindings of all ranks in the omp_comm
432a32e93adSJunchao Zhang     =================================================================================*/
433a32e93adSJunchao Zhang 
4349371c9d4SSatish Balay   ctrl->cpuset = hwloc_bitmap_alloc();
4359371c9d4SSatish Balay   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
43661ef3065SPierre Jolivet   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
437a32e93adSJunchao Zhang 
4383ab56b82SJunchao Zhang   /* The hwloc main developer said they would add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
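  /* number of unsigned longs needed to hold the topology cpuset, i.e., ceil((last set bit + 1) / bits per unsigned long) */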
439a32e93adSJunchao Zhang   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
4409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
441a32e93adSJunchao Zhang   if (nr_cpu_ulongs == 1) {
442a32e93adSJunchao Zhang     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
443a32e93adSJunchao Zhang   } else {
444*cf27e480SPierre Jolivet     for (PetscInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
445a32e93adSJunchao Zhang   }
446a32e93adSJunchao Zhang 
4479566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
448a32e93adSJunchao Zhang 
449a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
4509371c9d4SSatish Balay     ctrl->omp_cpuset = hwloc_bitmap_alloc();
4519371c9d4SSatish Balay     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
452a32e93adSJunchao Zhang     if (nr_cpu_ulongs == 1) {
4533ab56b82SJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
45461ef3065SPierre Jolivet       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
4553ab56b82SJunchao Zhang   #else
4563ab56b82SJunchao Zhang       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
4573ab56b82SJunchao Zhang   #endif
458a32e93adSJunchao Zhang     } else {
459*cf27e480SPierre Jolivet       for (PetscInt i = 0; i < nr_cpu_ulongs; i++) {
4603ab56b82SJunchao Zhang   #if HWLOC_API_VERSION >= 0x00020000
46161ef3065SPierre Jolivet         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
4623ab56b82SJunchao Zhang   #else
4633ab56b82SJunchao Zhang         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
4643ab56b82SJunchao Zhang   #endif
4653ab56b82SJunchao Zhang       }
466a32e93adSJunchao Zhang     }
467a32e93adSJunchao Zhang   }
4689566063dSJacob Faibussowitsch   PetscCall(PetscFree(cpu_ulongs));
469a32e93adSJunchao Zhang   *pctrl = ctrl;
4703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
471a32e93adSJunchao Zhang }
472a32e93adSJunchao Zhang 
473eff715bbSJunchao Zhang /*@C
47464f49babSJed Brown     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
475eff715bbSJunchao Zhang 
476eff715bbSJunchao Zhang     Input Parameter:
4778fcaa860SBarry Smith .   pctrl  - a PETSc OpenMP controller
478eff715bbSJunchao Zhang 
479eff715bbSJunchao Zhang     Level: developer
480eff715bbSJunchao Zhang 
481811af0c4SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
482eff715bbSJunchao Zhang @*/
483d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
484d71ae5a4SJacob Faibussowitsch {
485a32e93adSJunchao Zhang   PetscOmpCtrl ctrl = *pctrl;
486a32e93adSJunchao Zhang 
487a32e93adSJunchao Zhang   PetscFunctionBegin;
488a32e93adSJunchao Zhang   hwloc_bitmap_free(ctrl->cpuset);
489a32e93adSJunchao Zhang   hwloc_topology_destroy(ctrl->topology);
4903ba16761SJacob Faibussowitsch   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
4919566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
492a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
493a32e93adSJunchao Zhang     hwloc_bitmap_free(ctrl->omp_cpuset);
4949566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
495a32e93adSJunchao Zhang   }
4969566063dSJacob Faibussowitsch   PetscCall(PetscFree(ctrl));
4973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
498a32e93adSJunchao Zhang }
499a32e93adSJunchao Zhang 
500a32e93adSJunchao Zhang /*@C
5018fcaa860SBarry Smith     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
502a32e93adSJunchao Zhang 
503a32e93adSJunchao Zhang     Input Parameter:
5048fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
505a32e93adSJunchao Zhang 
506d8d19677SJose E. Roman     Output Parameters:
507eff715bbSJunchao Zhang +   omp_comm         - a communicator that includes a master rank and slave ranks, on which the master spawns threads
508a32e93adSJunchao Zhang .   omp_master_comm  - on master ranks, returns a communicator that includes the master ranks of each omp_comm;
509811af0c4SBarry Smith                        on slave ranks, `MPI_COMM_NULL` is returned.
510a32e93adSJunchao Zhang -   is_omp_master    - true if the calling process is an OMP master rank.
511a32e93adSJunchao Zhang 
512811af0c4SBarry Smith     Note:
513dc9a610eSPierre Jolivet     Any output parameter can be `NULL`; such a parameter is simply ignored.
514eff715bbSJunchao Zhang 
515a32e93adSJunchao Zhang     Level: developer
516811af0c4SBarry Smith 
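    Example Usage:
    A minimal sketch (illustrative only), assuming ctrl was created with `PetscOmpCtrlCreate()`:
.vb
    MPI_Comm  omp_comm, omp_master_comm;
    PetscBool is_master;

    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_master));
    if (is_master) { /* only master ranks hold a valid omp_master_comm */ }
.ve
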
517811af0c4SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
518a32e93adSJunchao Zhang @*/
519d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
520d71ae5a4SJacob Faibussowitsch {
521a32e93adSJunchao Zhang   PetscFunctionBegin;
522a32e93adSJunchao Zhang   if (omp_comm) *omp_comm = ctrl->omp_comm;
523a32e93adSJunchao Zhang   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
524a32e93adSJunchao Zhang   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
5253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
526a32e93adSJunchao Zhang }
527a32e93adSJunchao Zhang 
528eff715bbSJunchao Zhang /*@C
5298fcaa860SBarry Smith     PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
530eff715bbSJunchao Zhang 
531eff715bbSJunchao Zhang     Input Parameter:
5328fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
533eff715bbSJunchao Zhang 
534eff715bbSJunchao Zhang     Notes:
535811af0c4SBarry Smith     This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
536811af0c4SBarry Smith     require processes blocked in `MPI_Barrier()` to free their CPUs so that other processes can progress. In practice, to minimize latency,
537811af0c4SBarry Smith     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, pthread_barrier does satisfy this requirement.
538eff715bbSJunchao Zhang 
539811af0c4SBarry Smith     Code using `PetscOmpCtrlBarrier()` typically looks like this:
540811af0c4SBarry Smith .vb
541eff715bbSJunchao Zhang     if (is_omp_master) {
542eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
543eff715bbSJunchao Zhang       Call the library using OpenMP
544eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
545eff715bbSJunchao Zhang     }
546eff715bbSJunchao Zhang     PetscOmpCtrlBarrier(ctrl);
547811af0c4SBarry Smith .ve
548eff715bbSJunchao Zhang 
549eff715bbSJunchao Zhang     Level: developer
550eff715bbSJunchao Zhang 
551811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
552eff715bbSJunchao Zhang @*/
553d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
554d71ae5a4SJacob Faibussowitsch {
5552da392ccSBarry Smith   int err;
556a32e93adSJunchao Zhang 
557a32e93adSJunchao Zhang   PetscFunctionBegin;
5582da392ccSBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
55939619372SPierre Jolivet   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
5603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
561a32e93adSJunchao Zhang }
562a32e93adSJunchao Zhang 
563eff715bbSJunchao Zhang /*@C
564eff715bbSJunchao Zhang     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
565eff715bbSJunchao Zhang 
566eff715bbSJunchao Zhang     Input Parameter:
5678fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
568eff715bbSJunchao Zhang 
569811af0c4SBarry Smith     Note:
570811af0c4SBarry Smith     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
571eff715bbSJunchao Zhang     This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.
572eff715bbSJunchao Zhang 
573eff715bbSJunchao Zhang     Level: developer
574eff715bbSJunchao Zhang 
575811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
576eff715bbSJunchao Zhang @*/
577d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
578d71ae5a4SJacob Faibussowitsch {
579a32e93adSJunchao Zhang   PetscFunctionBegin;
58061ef3065SPierre Jolivet   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
581eff715bbSJunchao Zhang   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
5823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
583a32e93adSJunchao Zhang }
584a32e93adSJunchao Zhang 
585eff715bbSJunchao Zhang /*@C
586eff715bbSJunchao Zhang    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
587eff715bbSJunchao Zhang 
588eff715bbSJunchao Zhang    Input Parameter:
5898fcaa860SBarry Smith .  ctrl - a PETSc OMP controller
590eff715bbSJunchao Zhang 
591811af0c4SBarry Smith    Note:
592811af0c4SBarry Smith    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
593eff715bbSJunchao Zhang    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime back to 1.
594eff715bbSJunchao Zhang 
595eff715bbSJunchao Zhang    Level: developer
596eff715bbSJunchao Zhang 
597811af0c4SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
598eff715bbSJunchao Zhang @*/
599d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
600d71ae5a4SJacob Faibussowitsch {
601a32e93adSJunchao Zhang   PetscFunctionBegin;
60261ef3065SPierre Jolivet   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
603eff715bbSJunchao Zhang   omp_set_num_threads(1);
6043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
605a32e93adSJunchao Zhang }
606a32e93adSJunchao Zhang 
6074df5c2c7SJunchao Zhang   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
60820b3346cSJunchao Zhang #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
609