xref: /petsc/src/sys/utils/mpishm.c (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
15f7487a0SJunchao Zhang #include <petscsys.h>        /*I  "petscsys.h"  I*/
25f7487a0SJunchao Zhang #include <petsc/private/petscimpl.h>
35f7487a0SJunchao Zhang 
45f7487a0SJunchao Zhang struct _n_PetscShmComm {
55f7487a0SJunchao Zhang   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
65f7487a0SJunchao Zhang   PetscMPIInt shmsize;          /* size of the shared memory communicator */
75f7487a0SJunchao Zhang   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
85f7487a0SJunchao Zhang };
95f7487a0SJunchao Zhang 
105f7487a0SJunchao Zhang /*
1133779a13SJunchao Zhang    Private routine to delete the internal shared memory communicator when a communicator is freed.
125f7487a0SJunchao Zhang 
135f7487a0SJunchao Zhang    This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data attached as an attribute is freed.
145f7487a0SJunchao Zhang 
155f7487a0SJunchao Zhang    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
165f7487a0SJunchao Zhang 
175f7487a0SJunchao Zhang */
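/* For orientation only, a sketch (not the actual registration site) of how a delete callback like the one
   below is attached to communicators: the keyval is created once, typically during PetscInitialize(), roughly as

     MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&Petsc_ShmComm_keyval,NULL);

   after which MPI invokes the callback automatically whenever a communicator carrying the attribute is freed. */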
1833779a13SJunchao Zhang PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
195f7487a0SJunchao Zhang {
205f7487a0SJunchao Zhang   PetscShmComm p = (PetscShmComm)val;
215f7487a0SJunchao Zhang 
225f7487a0SJunchao Zhang   PetscFunctionBegin;
23*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(PetscInfo(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm));
24*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_free(&p->shmcomm));
25*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(PetscFree(p->globranks));
26*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(PetscFree(val));
275f7487a0SJunchao Zhang   PetscFunctionReturn(MPI_SUCCESS);
285f7487a0SJunchao Zhang }
295f7487a0SJunchao Zhang 
30b48189acSJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31b48189acSJunchao Zhang /* Data structures to support freeing comms created in PetscShmCommGet().
32b48189acSJunchao Zhang   Since communicators passed to PetscShmCommGet() are very likely either a petsc
33b48189acSJunchao Zhang   inner communicator or an MPI communicator with a linked petsc inner communicator,
34b48189acSJunchao Zhang   a simple static array suffices to store the communicators dupped in the rare
35b48189acSJunchao Zhang   remaining cases.
36b48189acSJunchao Zhang  */
37b48189acSJunchao Zhang #define MAX_SHMCOMM_DUPPED_COMMS 16
38b48189acSJunchao Zhang static PetscInt       num_dupped_comms=0;
39b48189acSJunchao Zhang static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40b48189acSJunchao Zhang static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41b48189acSJunchao Zhang {
42b48189acSJunchao Zhang   PetscInt         i;
43b48189acSJunchao Zhang   PetscFunctionBegin;
44*5f80ce2aSJacob Faibussowitsch   for (i=0; i<num_dupped_comms; i++) CHKERRQ(PetscCommDestroy(&shmcomm_dupped_comms[i]));
45b48189acSJunchao Zhang   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
46b48189acSJunchao Zhang   PetscFunctionReturn(0);
47b48189acSJunchao Zhang }
48b48189acSJunchao Zhang #endif
49b48189acSJunchao Zhang 
505f7487a0SJunchao Zhang /*@C
51b48189acSJunchao Zhang     PetscShmCommGet - Given a communicator, returns a sub-communicator of all ranks that share common memory
525f7487a0SJunchao Zhang 
53d083f849SBarry Smith     Collective.
545f7487a0SJunchao Zhang 
555f7487a0SJunchao Zhang     Input Parameter:
56b48189acSJunchao Zhang .   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm
575f7487a0SJunchao Zhang 
585f7487a0SJunchao Zhang     Output Parameter:
595f7487a0SJunchao Zhang .   pshmcomm - the PETSc shared memory communicator object
605f7487a0SJunchao Zhang 
615f7487a0SJunchao Zhang     Level: developer
625f7487a0SJunchao Zhang 
635f7487a0SJunchao Zhang     Notes:
645f7487a0SJunchao Zhang        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
655f7487a0SJunchao Zhang 
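    Example Usage:
    The following is a minimal sketch (error checking omitted); it obtains the shared memory communicator associated with PETSC_COMM_WORLD.

      PetscShmComm pshmcomm;
      MPI_Comm     shmcomm;
      PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);
      PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);
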
665f7487a0SJunchao Zhang @*/
675f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
685f7487a0SJunchao Zhang {
695f7487a0SJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
705f7487a0SJunchao Zhang   MPI_Group        globgroup,shmgroup;
715f7487a0SJunchao Zhang   PetscMPIInt      *shmranks,i,flg;
725f7487a0SJunchao Zhang   PetscCommCounter *counter;
735f7487a0SJunchao Zhang 
745f7487a0SJunchao Zhang   PetscFunctionBegin;
753ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,2);
76b48189acSJunchao Zhang   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
77*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg));
78b48189acSJunchao Zhang   if (!flg) { /* globcomm is not a petsc comm */
79b48189acSJunchao Zhang     union {MPI_Comm comm; void *ptr;} ucomm;
80b48189acSJunchao Zhang     /* check if globcomm already has a linked petsc inner comm */
81*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg));
82b48189acSJunchao Zhang     if (!flg) {
83b48189acSJunchao Zhang       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
842c71b3e2SJacob Faibussowitsch       PetscCheckFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
85*5f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscCommDuplicate(globcomm,&globcomm,NULL));
86b48189acSJunchao Zhang       /* The first time we dup a comm, register a function to free the dupped petsc comms at PetscFinalize() */
87*5f80ce2aSJacob Faibussowitsch       if (num_dupped_comms == 0) CHKERRQ(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
88b48189acSJunchao Zhang       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
89b48189acSJunchao Zhang       num_dupped_comms++;
90b48189acSJunchao Zhang     } else {
91b48189acSJunchao Zhang       /* otherwise, we pull out the inner comm and use it as globcomm */
92b48189acSJunchao Zhang       globcomm = ucomm.comm;
93b48189acSJunchao Zhang     }
94b48189acSJunchao Zhang   }
955f7487a0SJunchao Zhang 
96b48189acSJunchao Zhang   /* Check if globcomm already has an attached pshmcomm. If not, create one */
97*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg));
985f7487a0SJunchao Zhang   if (flg) PetscFunctionReturn(0);
995f7487a0SJunchao Zhang 
100*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(pshmcomm));
1015f7487a0SJunchao Zhang   (*pshmcomm)->globcomm = globcomm;
1025f7487a0SJunchao Zhang 
103*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm));
1045f7487a0SJunchao Zhang 
105*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize));
106*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_group(globcomm, &globgroup));
107*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
108*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&shmranks));
109*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks));
1105f7487a0SJunchao Zhang   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
111*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
112*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(shmranks));
113*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Group_free(&globgroup));
114*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Group_free(&shmgroup));
1155f7487a0SJunchao Zhang 
1165f7487a0SJunchao Zhang   for (i=0; i<(*pshmcomm)->shmsize; i++) {
117*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]));
1185f7487a0SJunchao Zhang   }
119*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm));
1205f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1215f7487a0SJunchao Zhang #else
1225f7487a0SJunchao Zhang   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
1235f7487a0SJunchao Zhang #endif
1245f7487a0SJunchao Zhang }
1255f7487a0SJunchao Zhang 
1265f7487a0SJunchao Zhang /*@C
1275f7487a0SJunchao Zhang     PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator
1285f7487a0SJunchao Zhang 
1295f7487a0SJunchao Zhang     Input Parameters:
1305f7487a0SJunchao Zhang +   pshmcomm - the shared memory communicator object
1315f7487a0SJunchao Zhang -   grank    - the global rank
1325f7487a0SJunchao Zhang 
1335f7487a0SJunchao Zhang     Output Parameter:
1345f7487a0SJunchao Zhang .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
1355f7487a0SJunchao Zhang 
1365f7487a0SJunchao Zhang     Level: developer
1375f7487a0SJunchao Zhang 
1385f7487a0SJunchao Zhang     Developer Notes:
1395f7487a0SJunchao Zhang     Assumes the pshmcomm->globranks[] array is sorted
1405f7487a0SJunchao Zhang 
1415f7487a0SJunchao Zhang     It may be better to rewrite this to map multiple global ranks to local in the same function call
1425f7487a0SJunchao Zhang 
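    Example Usage:
    A minimal sketch (error checking omitted); pshmcomm comes from PetscShmCommGet() and grank is a global rank chosen by the caller.

      PetscMPIInt lrank;
      PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);
      if (lrank == MPI_PROC_NULL) ... grank does not share memory with this process ...
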
1435f7487a0SJunchao Zhang @*/
1445f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
1455f7487a0SJunchao Zhang {
1465f7487a0SJunchao Zhang   PetscMPIInt    low,high,t,i;
1475f7487a0SJunchao Zhang   PetscBool      flg = PETSC_FALSE;
1485f7487a0SJunchao Zhang 
1495f7487a0SJunchao Zhang   PetscFunctionBegin;
1503ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
1513ca90d2dSJacob Faibussowitsch   PetscValidPointer(lrank,3);
1525f7487a0SJunchao Zhang   *lrank = MPI_PROC_NULL;
1535f7487a0SJunchao Zhang   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
1545f7487a0SJunchao Zhang   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
155*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL));
1565f7487a0SJunchao Zhang   if (flg) PetscFunctionReturn(0);
1575f7487a0SJunchao Zhang   low  = 0;
1585f7487a0SJunchao Zhang   high = pshmcomm->shmsize;
1595f7487a0SJunchao Zhang   while (high-low > 5) {
1605f7487a0SJunchao Zhang     t = (low+high)/2;
1615f7487a0SJunchao Zhang     if (pshmcomm->globranks[t] > grank) high = t;
1625f7487a0SJunchao Zhang     else low = t;
1635f7487a0SJunchao Zhang   }
1645f7487a0SJunchao Zhang   for (i=low; i<high; i++) {
1655f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
1665f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] == grank) {
1675f7487a0SJunchao Zhang       *lrank = i;
1685f7487a0SJunchao Zhang       PetscFunctionReturn(0);
1695f7487a0SJunchao Zhang     }
1705f7487a0SJunchao Zhang   }
1715f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1725f7487a0SJunchao Zhang }
1735f7487a0SJunchao Zhang 
1745f7487a0SJunchao Zhang /*@C
1755f7487a0SJunchao Zhang     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank
1765f7487a0SJunchao Zhang 
1775f7487a0SJunchao Zhang     Input Parameters:
1785f7487a0SJunchao Zhang +   pshmcomm - the shared memory communicator object
1795f7487a0SJunchao Zhang -   lrank    - the local rank in the shared memory communicator
1805f7487a0SJunchao Zhang 
1815f7487a0SJunchao Zhang     Output Parameter:
1825f7487a0SJunchao Zhang .   grank - the global rank in the global communicator from which the shared memory communicator was built
1835f7487a0SJunchao Zhang 
1845f7487a0SJunchao Zhang     Level: developer
1855f7487a0SJunchao Zhang 
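    Example Usage:
    A minimal sketch (error checking omitted); pshmcomm comes from PetscShmCommGet(), and the call obtains the global rank of local rank 0 in the shared memory communicator.

      PetscMPIInt grank;
      PetscShmCommLocalToGlobal(pshmcomm,0,&grank);
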
1865f7487a0SJunchao Zhang @*/
1875f7487a0SJunchao Zhang PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
1885f7487a0SJunchao Zhang {
1895f7487a0SJunchao Zhang   PetscFunctionBegin;
1903ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
1913ca90d2dSJacob Faibussowitsch   PetscValidPointer(grank,3);
1922c71b3e2SJacob Faibussowitsch   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
1935f7487a0SJunchao Zhang   *grank = pshmcomm->globranks[lrank];
1945f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1955f7487a0SJunchao Zhang }
1965f7487a0SJunchao Zhang 
1975f7487a0SJunchao Zhang /*@C
1985f7487a0SJunchao Zhang     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
1995f7487a0SJunchao Zhang 
2005f7487a0SJunchao Zhang     Input Parameter:
2015f7487a0SJunchao Zhang .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
2025f7487a0SJunchao Zhang 
2035f7487a0SJunchao Zhang     Output Parameter:
2045f7487a0SJunchao Zhang .   comm     - the MPI communicator
2055f7487a0SJunchao Zhang 
2065f7487a0SJunchao Zhang     Level: developer
2075f7487a0SJunchao Zhang 
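    Example Usage:
    A minimal sketch (error checking omitted); pshmcomm comes from PetscShmCommGet(). It queries the calling process's rank within the set of processes sharing memory.

      MPI_Comm    shmcomm;
      PetscMPIInt shmrank;
      PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);
      MPI_Comm_rank(shmcomm,&shmrank);
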
2085f7487a0SJunchao Zhang @*/
2095f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
2105f7487a0SJunchao Zhang {
2115f7487a0SJunchao Zhang   PetscFunctionBegin;
2123ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
2133ca90d2dSJacob Faibussowitsch   PetscValidPointer(comm,2);
2145f7487a0SJunchao Zhang   *comm = pshmcomm->shmcomm;
2155f7487a0SJunchao Zhang   PetscFunctionReturn(0);
2165f7487a0SJunchao Zhang }
2175f7487a0SJunchao Zhang 
21820b3346cSJunchao Zhang #if defined(PETSC_HAVE_OPENMP_SUPPORT)
219a32e93adSJunchao Zhang #include <pthread.h>
220a32e93adSJunchao Zhang #include <hwloc.h>
221a32e93adSJunchao Zhang #include <omp.h>
222a32e93adSJunchao Zhang 
223eff715bbSJunchao Zhang /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
224eff715bbSJunchao Zhang    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
2254df5c2c7SJunchao Zhang    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
2264df5c2c7SJunchao Zhang    by 50%. Until the reason is understood, we use mmap() instead.
2274df5c2c7SJunchao Zhang */
2284df5c2c7SJunchao Zhang #define USE_MMAP_ALLOCATE_SHARED_MEMORY
2294df5c2c7SJunchao Zhang 
2304df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2314df5c2c7SJunchao Zhang #include <sys/mman.h>
2324df5c2c7SJunchao Zhang #include <sys/types.h>
2334df5c2c7SJunchao Zhang #include <sys/stat.h>
2344df5c2c7SJunchao Zhang #include <fcntl.h>
2354df5c2c7SJunchao Zhang #endif
2364df5c2c7SJunchao Zhang 
237a32e93adSJunchao Zhang struct _n_PetscOmpCtrl {
238a32e93adSJunchao Zhang   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
239a32e93adSJunchao Zhang   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
240a32e93adSJunchao Zhang   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
241a32e93adSJunchao Zhang   PetscBool         is_omp_master;   /* PETSC_TRUE on rank 0 of omp_comm */
242a32e93adSJunchao Zhang   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
243a32e93adSJunchao Zhang   pthread_barrier_t *barrier;        /* pointer to the barrier */
244a32e93adSJunchao Zhang   hwloc_topology_t  topology;
245a32e93adSJunchao Zhang   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
246a32e93adSJunchao Zhang   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
247a32e93adSJunchao Zhang };
248a32e93adSJunchao Zhang 
249eff715bbSJunchao Zhang /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
2508fcaa860SBarry Smith    contained by the controller.
251eff715bbSJunchao Zhang 
2528fcaa860SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
253eff715bbSJunchao Zhang    only because we want to separate shared memory allocation methods from other code.
254eff715bbSJunchao Zhang  */
2559fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
256a32e93adSJunchao Zhang {
257a32e93adSJunchao Zhang   MPI_Aint              size;
258a32e93adSJunchao Zhang   void                  *baseptr;
259a32e93adSJunchao Zhang   pthread_barrierattr_t  attr;
260a32e93adSJunchao Zhang 
2614df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2624df5c2c7SJunchao Zhang   PetscInt              fd;
2634df5c2c7SJunchao Zhang   PetscChar             pathname[PETSC_MAX_PATH_LEN];
2644df5c2c7SJunchao Zhang #else
2654df5c2c7SJunchao Zhang   PetscMPIInt           disp_unit;
2664df5c2c7SJunchao Zhang #endif
2674df5c2c7SJunchao Zhang 
2684df5c2c7SJunchao Zhang   PetscFunctionBegin;
2694df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2704df5c2c7SJunchao Zhang   size = sizeof(pthread_barrier_t);
2714df5c2c7SJunchao Zhang   if (ctrl->is_omp_master) {
272eff715bbSJunchao Zhang     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
273*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN));
274*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN));
2754df5c2c7SJunchao Zhang     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
2762c71b3e2SJacob Faibussowitsch     fd      = mkstemp(pathname); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
277*5f80ce2aSJacob Faibussowitsch     CHKERRQ(ftruncate(fd,size));
2782c71b3e2SJacob Faibussowitsch     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
279*5f80ce2aSJacob Faibussowitsch     CHKERRQ(close(fd));
280*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
281eff715bbSJunchao Zhang     /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
282*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
283*5f80ce2aSJacob Faibussowitsch     CHKERRQ(unlink(pathname));
2844df5c2c7SJunchao Zhang   } else {
285*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
2862c71b3e2SJacob Faibussowitsch     fd      = open(pathname,O_RDWR); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
2872c71b3e2SJacob Faibussowitsch     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
288*5f80ce2aSJacob Faibussowitsch     CHKERRQ(close(fd));
289*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
2904df5c2c7SJunchao Zhang   }
2914df5c2c7SJunchao Zhang #else
292a32e93adSJunchao Zhang   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
293*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win));
294*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr));
2954df5c2c7SJunchao Zhang #endif
296a32e93adSJunchao Zhang   ctrl->barrier = (pthread_barrier_t*)baseptr;
297a32e93adSJunchao Zhang 
298a32e93adSJunchao Zhang   /* omp master initializes the barrier */
299a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
300*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size));
301*5f80ce2aSJacob Faibussowitsch     CHKERRQ(pthread_barrierattr_init(&attr));
302*5f80ce2aSJacob Faibussowitsch     CHKERRQ(pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
303*5f80ce2aSJacob Faibussowitsch     CHKERRQ(pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size));
304*5f80ce2aSJacob Faibussowitsch     CHKERRQ(pthread_barrierattr_destroy(&attr));
305a32e93adSJunchao Zhang   }
306a32e93adSJunchao Zhang 
3074df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
308*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
309a32e93adSJunchao Zhang   PetscFunctionReturn(0);
310a32e93adSJunchao Zhang }
311a32e93adSJunchao Zhang 
3128fcaa860SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
3139fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
314a32e93adSJunchao Zhang {
3154df5c2c7SJunchao Zhang   PetscFunctionBegin;
3164df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before the master destroys it */
317*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
318*5f80ce2aSJacob Faibussowitsch   if (ctrl->is_omp_master) CHKERRQ(pthread_barrier_destroy(ctrl->barrier));
3194df5c2c7SJunchao Zhang 
3204df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
321*5f80ce2aSJacob Faibussowitsch   CHKERRQ(munmap(ctrl->barrier,sizeof(pthread_barrier_t)));
3224df5c2c7SJunchao Zhang #else
323*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Win_free(&ctrl->omp_win));
3244df5c2c7SJunchao Zhang #endif
325a32e93adSJunchao Zhang   PetscFunctionReturn(0);
326a32e93adSJunchao Zhang }
327a32e93adSJunchao Zhang 
328eff715bbSJunchao Zhang /*@C
3298fcaa860SBarry Smith     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP
330eff715bbSJunchao Zhang 
331d8d19677SJose E. Roman     Input Parameters:
332eff715bbSJunchao Zhang +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
333a2b725a8SWilliam Gropp -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
334eff715bbSJunchao Zhang 
335eff715bbSJunchao Zhang     Output Parameter:
3368fcaa860SBarry Smith .   pctrl      - a PETSc OpenMP controller
337eff715bbSJunchao Zhang 
338eff715bbSJunchao Zhang     Level: developer
339eff715bbSJunchao Zhang 
3408fcaa860SBarry Smith     TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use
3418fcaa860SBarry Smith 
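    Example Usage:
    A minimal sketch (error checking omitted); nthreads = -1 lets PETSc pick a suitable number of threads, and the controller is destroyed with PetscOmpCtrlDestroy() when no longer needed.

      PetscOmpCtrl ctrl;
      PetscOmpCtrlCreate(PETSC_COMM_WORLD,-1,&ctrl);
      ... use the communicators from PetscOmpCtrlGetOmpComms() with a third party OpenMP library ...
      PetscOmpCtrlDestroy(&ctrl);
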
342eff715bbSJunchao Zhang .seealso: PetscOmpCtrlDestroy()
343eff715bbSJunchao Zhang @*/
344a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
345a32e93adSJunchao Zhang {
346a32e93adSJunchao Zhang   PetscOmpCtrl   ctrl;
347a32e93adSJunchao Zhang   unsigned long *cpu_ulongs = NULL;
348a32e93adSJunchao Zhang   PetscInt       i,nr_cpu_ulongs;
349a32e93adSJunchao Zhang   PetscShmComm   pshmcomm;
350a32e93adSJunchao Zhang   MPI_Comm       shm_comm;
351a32e93adSJunchao Zhang   PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
3527c405c4aSJunchao Zhang   PetscInt       num_packages,num_cores;
353a32e93adSJunchao Zhang 
354a32e93adSJunchao Zhang   PetscFunctionBegin;
355*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&ctrl));
356a32e93adSJunchao Zhang 
357a32e93adSJunchao Zhang   /*=================================================================================
3587c405c4aSJunchao Zhang     Init hwloc
3597c405c4aSJunchao Zhang    ==================================================================================*/
360*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_topology_init(&ctrl->topology));
3617c405c4aSJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
3627c405c4aSJunchao Zhang   /* to filter out unneeded info and have faster hwloc_topology_load */
363*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE));
364*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL));
3657c405c4aSJunchao Zhang #endif
366*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_topology_load(ctrl->topology));
3677c405c4aSJunchao Zhang 
3687c405c4aSJunchao Zhang   /*=================================================================================
369a32e93adSJunchao Zhang     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
370a32e93adSJunchao Zhang     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
371a32e93adSJunchao Zhang     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
372a32e93adSJunchao Zhang     which is usually passed to third party libraries.
373a32e93adSJunchao Zhang    ==================================================================================*/
374a32e93adSJunchao Zhang 
375a32e93adSJunchao Zhang   /* fetch the stored shared memory communicator */
376*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscShmCommGet(petsc_comm,&pshmcomm));
377*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm));
378a32e93adSJunchao Zhang 
379*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(shm_comm,&shm_rank));
380*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(shm_comm,&shm_comm_size));
381a32e93adSJunchao Zhang 
3827c405c4aSJunchao Zhang   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
3837c405c4aSJunchao Zhang   if (nthreads == -1) {
384a312e481SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
385a312e481SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
3867c405c4aSJunchao Zhang     nthreads     = num_cores/num_packages;
3877c405c4aSJunchao Zhang     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
3887c405c4aSJunchao Zhang   }
3897c405c4aSJunchao Zhang 
390*5f80ce2aSJacob Faibussowitsch   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size);
391*5f80ce2aSJacob Faibussowitsch   if (shm_comm_size % nthreads) CHKERRQ(PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size));
392a32e93adSJunchao Zhang 
393a32e93adSJunchao Zhang   /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
394a32e93adSJunchao Zhang      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
395a32e93adSJunchao Zhang      color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
396a32e93adSJunchao Zhang      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
397a32e93adSJunchao Zhang      Use 0 as the key so that rank ordering won't change in the new comm.
398a32e93adSJunchao Zhang    */
399a32e93adSJunchao Zhang   color = shm_rank / nthreads;
400*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm));
401a32e93adSJunchao Zhang 
402a32e93adSJunchao Zhang   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
403*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_rank(ctrl->omp_comm,&omp_rank));
404a32e93adSJunchao Zhang   if (!omp_rank) {
405a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_TRUE;  /* master */
406a32e93adSJunchao Zhang     color = 0;
407a32e93adSJunchao Zhang   } else {
408a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_FALSE; /* slave */
409a32e93adSJunchao Zhang     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
410a32e93adSJunchao Zhang   }
411*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm));
412a32e93adSJunchao Zhang 
413a32e93adSJunchao Zhang   /*=================================================================================
414a32e93adSJunchao Zhang     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
415a32e93adSJunchao Zhang     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
416a32e93adSJunchao Zhang     and run them on the idle CPUs.
417a32e93adSJunchao Zhang    ==================================================================================*/
418*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOmpCtrlCreateBarrier(ctrl));
419a32e93adSJunchao Zhang 
420a32e93adSJunchao Zhang   /*=================================================================================
421a32e93adSJunchao Zhang     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
422a32e93adSJunchao Zhang     is the union of the bindings of all ranks in the omp_comm
423a32e93adSJunchao Zhang     =================================================================================*/
424a32e93adSJunchao Zhang 
4252c71b3e2SJacob Faibussowitsch   ctrl->cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
426*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
427a32e93adSJunchao Zhang 
4283ab56b82SJunchao Zhang   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
429a32e93adSJunchao Zhang   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
430*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs));
431a32e93adSJunchao Zhang   if (nr_cpu_ulongs == 1) {
432a32e93adSJunchao Zhang     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
433a32e93adSJunchao Zhang   } else {
434a32e93adSJunchao Zhang     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
435a32e93adSJunchao Zhang   }
436a32e93adSJunchao Zhang 
437*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm));
438a32e93adSJunchao Zhang 
439a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
4402c71b3e2SJacob Faibussowitsch     ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
441a32e93adSJunchao Zhang     if (nr_cpu_ulongs == 1) {
4423ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
443*5f80ce2aSJacob Faibussowitsch       CHKERRQ(hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]));
4443ab56b82SJunchao Zhang #else
4453ab56b82SJunchao Zhang       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
4463ab56b82SJunchao Zhang #endif
447a32e93adSJunchao Zhang     } else {
4483ab56b82SJunchao Zhang       for (i=0; i<nr_cpu_ulongs; i++)  {
4493ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
450*5f80ce2aSJacob Faibussowitsch         CHKERRQ(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]));
4513ab56b82SJunchao Zhang #else
4523ab56b82SJunchao Zhang         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
4533ab56b82SJunchao Zhang #endif
4543ab56b82SJunchao Zhang       }
455a32e93adSJunchao Zhang     }
456a32e93adSJunchao Zhang   }
457a32e93adSJunchao Zhang 
458*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(cpu_ulongs));
459a32e93adSJunchao Zhang   *pctrl = ctrl;
460a32e93adSJunchao Zhang   PetscFunctionReturn(0);
461a32e93adSJunchao Zhang }
462a32e93adSJunchao Zhang 
463eff715bbSJunchao Zhang /*@C
46464f49babSJed Brown     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
465eff715bbSJunchao Zhang 
466eff715bbSJunchao Zhang     Input Parameter:
4678fcaa860SBarry Smith .   pctrl  - a PETSc OpenMP controller
468eff715bbSJunchao Zhang 
469eff715bbSJunchao Zhang     Level: developer
470eff715bbSJunchao Zhang 
471eff715bbSJunchao Zhang .seealso: PetscOmpCtrlCreate()
472eff715bbSJunchao Zhang @*/
473a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
474a32e93adSJunchao Zhang {
475a32e93adSJunchao Zhang   PetscOmpCtrl ctrl = *pctrl;
476a32e93adSJunchao Zhang 
477a32e93adSJunchao Zhang   PetscFunctionBegin;
478a32e93adSJunchao Zhang   hwloc_bitmap_free(ctrl->cpuset);
479a32e93adSJunchao Zhang   hwloc_topology_destroy(ctrl->topology);
480a32e93adSJunchao Zhang   PetscOmpCtrlDestroyBarrier(ctrl);
481*5f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_free(&ctrl->omp_comm));
482a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
483a32e93adSJunchao Zhang     hwloc_bitmap_free(ctrl->omp_cpuset);
484*5f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_free(&ctrl->omp_master_comm));
485a32e93adSJunchao Zhang   }
486*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(ctrl));
487a32e93adSJunchao Zhang   PetscFunctionReturn(0);
488a32e93adSJunchao Zhang }
489a32e93adSJunchao Zhang 
490a32e93adSJunchao Zhang /*@C
4918fcaa860SBarry Smith     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
492a32e93adSJunchao Zhang 
493a32e93adSJunchao Zhang     Input Parameter:
4948fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
495a32e93adSJunchao Zhang 
496d8d19677SJose E. Roman     Output Parameters:
497eff715bbSJunchao Zhang +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
498a32e93adSJunchao Zhang .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
499a32e93adSJunchao Zhang                        on slave ranks, MPI_COMM_NULL is returned.
500a32e93adSJunchao Zhang -   is_omp_master    - true if the calling process is an OMP master rank.
501a32e93adSJunchao Zhang 
502eff715bbSJunchao Zhang     Notes: any output parameter can be NULL; such a parameter is simply ignored.
503eff715bbSJunchao Zhang 
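    Example Usage:
    A minimal sketch (error checking omitted); ctrl comes from PetscOmpCtrlCreate(), and unused outputs are passed as NULL.

      MPI_Comm  omp_master_comm;
      PetscBool is_master;
      PetscOmpCtrlGetOmpComms(ctrl,NULL,&omp_master_comm,&is_master);
      if (is_master) ... pass omp_master_comm to the third party OpenMP library ...
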
504a32e93adSJunchao Zhang     Level: developer
505a32e93adSJunchao Zhang @*/
506a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
507a32e93adSJunchao Zhang {
508a32e93adSJunchao Zhang   PetscFunctionBegin;
509a32e93adSJunchao Zhang   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
510a32e93adSJunchao Zhang   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
511a32e93adSJunchao Zhang   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
512a32e93adSJunchao Zhang   PetscFunctionReturn(0);
513a32e93adSJunchao Zhang }
514a32e93adSJunchao Zhang 
515eff715bbSJunchao Zhang /*@C
5168fcaa860SBarry Smith     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
517eff715bbSJunchao Zhang 
518eff715bbSJunchao Zhang     Input Parameter:
5198fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
520eff715bbSJunchao Zhang 
521eff715bbSJunchao Zhang     Notes:
522eff715bbSJunchao Zhang     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct, but the MPI standard does not
523eff715bbSJunchao Zhang     require processes blocked by MPI_Barrier to free their CPUs to let other processes progress. In practice, to minimize latency,
524eff715bbSJunchao Zhang     MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.
525eff715bbSJunchao Zhang 
526eff715bbSJunchao Zhang     Code using PetscOmpCtrlBarrier() typically looks like this:
527eff715bbSJunchao Zhang 
528eff715bbSJunchao Zhang     if (is_omp_master) {
529eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
530eff715bbSJunchao Zhang       Call the library using OpenMP
531eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
532eff715bbSJunchao Zhang     }
533eff715bbSJunchao Zhang     PetscOmpCtrlBarrier(ctrl);
534eff715bbSJunchao Zhang 
535eff715bbSJunchao Zhang     Level: developer
536eff715bbSJunchao Zhang 
537eff715bbSJunchao Zhang .seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
538eff715bbSJunchao Zhang @*/
539a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
540a32e93adSJunchao Zhang {
5412da392ccSBarry Smith   int err;
542a32e93adSJunchao Zhang 
543a32e93adSJunchao Zhang   PetscFunctionBegin;
5442da392ccSBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
5452c71b3e2SJacob Faibussowitsch   PetscCheckFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d",err);
546a32e93adSJunchao Zhang   PetscFunctionReturn(0);
547a32e93adSJunchao Zhang }
548a32e93adSJunchao Zhang 
549eff715bbSJunchao Zhang /*@C
550eff715bbSJunchao Zhang     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
551eff715bbSJunchao Zhang 
552eff715bbSJunchao Zhang     Input Parameter:
5538fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
554eff715bbSJunchao Zhang 
555eff715bbSJunchao Zhang     Notes:
5568fcaa860SBarry Smith     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
557eff715bbSJunchao Zhang     This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime
558eff715bbSJunchao Zhang 
559eff715bbSJunchao Zhang     Level: developer
560eff715bbSJunchao Zhang 
561eff715bbSJunchao Zhang .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
562eff715bbSJunchao Zhang @*/
563a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
564a32e93adSJunchao Zhang {
565a32e93adSJunchao Zhang   PetscFunctionBegin;
566*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS));
567eff715bbSJunchao Zhang   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
568a32e93adSJunchao Zhang   PetscFunctionReturn(0);
569a32e93adSJunchao Zhang }
570a32e93adSJunchao Zhang 
571eff715bbSJunchao Zhang /*@C
572eff715bbSJunchao Zhang    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
573eff715bbSJunchao Zhang 
574eff715bbSJunchao Zhang    Input Parameter:
5758fcaa860SBarry Smith .  ctrl - a PETSc OMP controller
576eff715bbSJunchao Zhang 
577eff715bbSJunchao Zhang    Notes:
5788fcaa860SBarry Smith    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
579eff715bbSJunchao Zhang    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.
580eff715bbSJunchao Zhang 
581eff715bbSJunchao Zhang    Level: developer
582eff715bbSJunchao Zhang 
583eff715bbSJunchao Zhang .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
584eff715bbSJunchao Zhang @*/
585a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
586a32e93adSJunchao Zhang {
587a32e93adSJunchao Zhang   PetscFunctionBegin;
588*5f80ce2aSJacob Faibussowitsch   CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS));
589eff715bbSJunchao Zhang   omp_set_num_threads(1);
590a32e93adSJunchao Zhang   PetscFunctionReturn(0);
591a32e93adSJunchao Zhang }
592a32e93adSJunchao Zhang 
5934df5c2c7SJunchao Zhang #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
59420b3346cSJunchao Zhang #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */