#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator attribute when a communicator is freed.

   This is called by MPI, not by users. It is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode  ierr;
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
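
/* Illustrative usage sketch (not from the PETSc manual; the helper name and variables are made up here):
   obtain the node-local (shared memory) view of a PETSc communicator.

     PetscErrorCode ShowSharedMemoryLayout(MPI_Comm petsc_comm)
     {
       PetscErrorCode ierr;
       PetscShmComm   shm;
       MPI_Comm       node_comm;
       PetscMPIInt    nodesize;

       PetscFunctionBegin;
       ierr = PetscShmCommGet(petsc_comm,&shm);CHKERRQ(ierr);            // petsc_comm must come from PetscCommDuplicate()
       ierr = PetscShmCommGetMpiShmComm(shm,&node_comm);CHKERRQ(ierr);   // MPI communicator of the ranks on this node
       ierr = MPI_Comm_size(node_comm,&nodesize);CHKERRQ(ierr);
       ierr = PetscPrintf(PETSC_COMM_SELF,"%d ranks share memory with me\n",(int)nodesize);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/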

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}
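
/* Illustrative usage sketch (hypothetical helper, not part of PETSc): determine how many global ranks
   share memory with this process by mapping every global rank to a local rank.

     PetscErrorCode CountOnNodeRanks(PetscShmComm shm,PetscMPIInt globsize,PetscMPIInt *nlocal)
     {
       PetscErrorCode ierr;
       PetscMPIInt    grank,lrank,count = 0;

       PetscFunctionBegin;
       for (grank=0; grank<globsize; grank++) {
         ierr = PetscShmCommGlobalToLocal(shm,grank,&lrank);CHKERRQ(ierr);
         if (lrank != MPI_PROC_NULL) count++;   // grank lives on the same node as this process
       }
       *nlocal = count;
       PetscFunctionReturn(0);
     }
*/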

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
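
/* Illustrative sketch (my own example, not from the PETSc documentation): the communicator returned by
   PetscShmCommGetMpiShmComm() is suitable for MPI-3 shared memory windows, for example allocating one
   array per node that every on-node rank can read and write directly.

     PetscErrorCode AllocateNodeSharedArray(PetscShmComm shm,MPI_Aint nbytes,void **baseptr,MPI_Win *win)
     {
       PetscErrorCode ierr;
       MPI_Comm       node_comm;
       PetscMPIInt    node_rank,disp_unit;
       MPI_Aint       sz;

       PetscFunctionBegin;
       ierr = PetscShmCommGetMpiShmComm(shm,&node_comm);CHKERRQ(ierr);
       ierr = MPI_Comm_rank(node_comm,&node_rank);CHKERRQ(ierr);
       // rank 0 of the node allocates the whole array, everyone else allocates 0 bytes
       ierr = MPI_Win_allocate_shared(node_rank ? 0 : nbytes,1,MPI_INFO_NULL,node_comm,baseptr,win);CHKERRQ(ierr);
       // all ranks query rank 0's segment so they share the same memory
       ierr = MPI_Win_shared_query(*win,0,&sz,&disp_unit,baseptr);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/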

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true if this rank is rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t  attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would broadcast the partially populated pathname to the slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier is to wait for the slaves to open the file before the master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure the slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets (packages) per compute node\n");
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);    if (num_cores    <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that the rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode  ierr;
  PetscOmpCtrl    ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes:
    Any output parameter can be NULL; such parameters are simply ignored

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
    require processes blocked in MPI_Barrier to free their CPUs so that other processes can progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.

    Code using PetscOmpCtrlBarrier() looks like this:

    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to see if this is a master rank.
    This function changes the CPU binding of master ranks and the number of threads used by the OpenMP runtime

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to see if this is a master rank.
   This function restores the CPU binding of master ranks and sets the number of OpenMP threads back to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}
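
/* Illustrative end-to-end sketch (not an existing PETSc example; the library call "ThirdPartySolve"
   is hypothetical): how a caller would drive a third-party OpenMP library through the controller.

     PetscErrorCode SolveWithOpenMPLibrary(MPI_Comm petsc_comm)
     {
       PetscErrorCode ierr;
       PetscOmpCtrl   ctrl;
       MPI_Comm       master_comm;
       PetscBool      is_master;

       PetscFunctionBegin;
       ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);              // -1: let PETSc pick nthreads
       ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,&master_comm,&is_master);CHKERRQ(ierr);
       if (is_master) {
         ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);           // rebind CPUs, raise the OpenMP thread count
         ierr = ThirdPartySolve(master_comm);CHKERRQ(ierr);                       // hypothetical OpenMP library call
         ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);             // restore binding, threads back to 1
       }
       ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);                            // slaves sleep here while the master runs OpenMP
       ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/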

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */